In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files stored there can be reached from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Alternative kept for reference: copy models.py into the runtime instead of
# changing directory.
# !cp "drive/MyDrive/Question Generation/vae/models.py" .
# Work from the project's vae folder; later cells use paths relative to it
# (e.g. "../data/...") and import a local module from it.
%cd '/content/drive/MyDrive/Question Generation/vae/'
# Print the working directory to confirm the change took effect.
!pwd

/content/drive/MyDrive/Question Generation/vae
/content/drive/MyDrive/Question Generation/vae


In [3]:
# Report the CUDA toolkit version available in this runtime.
!nvcc --version

import torch
# Report the installed PyTorch build; the torch-scatter wheel index used in the
# install cell is chosen to match this torch/CUDA combination.
print("\nPytorch version: ", torch.__version__)

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0

Pytorch version:  2.0.0+cu118


In [4]:
!pip install transformers
## scatter 1.12+cu113
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.1+cu113.html
# scatter 1.13+cu116
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu118.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [5]:
import argparse
import json
import os

from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer
import numpy as np
from models2 import DiscreteVAE

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [7]:
class CustomDatset(Dataset):
    """Paragraph contexts from a SQuAD-style JSON file, tokenized to fixed-length id tensors.

    The input file must have the layout
    ``{"data": [{"paragraphs": [{"context": str, "qas": [{"question": str,
    "answers": [{"text": str}, ...]}, ...]}, ...]}, ...]}``.

    Attributes:
        lines:   context strings, ``myData`` entries first, then the sampled paragraphs.
        trueQA:  one ``(questions, answers)`` tuple per entry of ``lines``;
                 ``('N/A', 'N/A')`` for the entries that came from ``myData``.
        num_total: number of contexts (``len(lines)``).
    """

    def __init__(self, tokenizer, input_file, max_length=512,maxParaCount=float("inf"),myData=None):
        """Load the file, sample paragraphs, and remember the reference Q/A pairs.

        Args:
            tokenizer: object providing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
            input_file: path of the SQuAD-style JSON file.
            max_length: every item is truncated / zero-padded to exactly this many ids.
            maxParaCount: upper bound on the number of paragraphs sampled (without
                replacement, original order kept) from the file. ``float("inf")``
                keeps all of them; finite floats such as ``128.0`` are accepted.
            myData: optional list of extra context strings placed before the sampled
                paragraphs. ``None`` (the default) means no extra contexts.
        """
        with open(input_file) as f:
            data = json.load(f)
        assert("context" in data['data'][0]['paragraphs'][0])

        # A fresh list per instance: a literal [] default would be shared between calls.
        extra_contexts = [] if myData is None else myData

        # One (context, questions, answers-per-question) triple per paragraph.
        paragraphs = [
            (
                paragraph['context'],
                [qa['question'] for qa in paragraph['qas']],
                [[a['text'] for a in qa['answers']] for qa in paragraph['qas']],
            )
            for topic in data['data']
            for paragraph in topic['paragraphs']
        ]

        # np.random.choice needs an integer sample size; min() would return a float
        # whenever a finite float cap is smaller than the number of paragraphs.
        sample_size = int(min(len(paragraphs), maxParaCount))
        # Sorting keeps the sampled paragraphs in their original file order.
        indices = np.sort(np.random.choice(np.arange(len(paragraphs)), sample_size, replace=False))

        self.lines = []
        self.trueQA = []

        # User-supplied contexts have no reference questions/answers.
        for context in extra_contexts:
            self.lines.append(context)
            self.trueQA.append(('N/A','N/A'))

        for i in indices:
            context, questions, answers = paragraphs[i]
            self.lines.append(context)
            self.trueQA.append((questions, answers))

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_total = len(self.lines)
        print("Number of contexts: ", self.num_total)

    def __getitem__(self, idx):
        """Return the ``idx``-th context as a ``torch.long`` tensor of ``max_length`` ids."""
        context = self.lines[idx].strip()
        # Tokenize the whole context, keeping at most max_length tokens.
        tokens = self.tokenizer.tokenize(context)[:self.max_length]
        ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Right-pad with id 0 up to max_length (no-op when already full).
        ids.extend([0] * (self.max_length - len(ids)))
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        """Number of contexts held by the dataset."""
        return self.num_total

In [8]:
def loadModel(args):
    """Build the tokenizer, the evaluation data loader and the trained VAE.

    Args:
        args: configuration object exposing ``bert_model``, ``data_file``,
            ``max_length``, ``maxParaCount``, ``batch_size`` and ``checkpoint``.

    Returns:
        ``(tokenizer, data_loader, vae)`` where ``data_loader`` iterates the
        contexts in dataset order (no shuffling) and ``vae`` is in eval mode on
        the GPU when one is available, otherwise on the CPU.
    """
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    data = CustomDatset(tokenizer, args.data_file, args.max_length, args.maxParaCount)
    # shuffle=False keeps batch order aligned with data.lines / data.trueQA.
    data_loader = DataLoader(data, shuffle=False, batch_size=args.batch_size)

    # torch.cuda.current_device() raises on CPU-only machines; fall back to the
    # CPU the same way the notebook's global `device` does.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): torch.load unpickles arbitrary Python objects - only load
    # checkpoints from a trusted source.
    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    # The checkpoint carries the training-time arguments used to build the model.
    vae = DiscreteVAE(checkpoint["args"])
    vae.load_state_dict(checkpoint["state_dict"])
    vae.eval()
    vae = vae.to(target_device)

    return tokenizer, data_loader, vae

def main(args, tokenizer, data_loader, vae,tsne=False):
    """Generate synthetic question/answer pairs for every context in ``data_loader``.

    For each batch the question latent is sampled once per question slot and the
    answer latent ``numAs`` times; ``vae.generate`` is called for each answer
    latent. Only the first generated batch of questions in a ``k``-iteration is
    kept (``selected_qs``); later calls contribute their answer spans only.

    Every generated pair is appended as one JSON line to
    ``<args.output_dir>/synthetic_qa.jsonl``. When ``tsne`` is truthy, one JSON
    line per generated batch (decoded questions plus question latents) is also
    written to ``<args.output_dir>/tsne.jsonl``.

    Args:
        args: configuration exposing ``output_dir`` and ``k`` (number of latent
            sampling rounds per batch).
        tokenizer: object providing ``decode(ids, skip_special_tokens=True)``.
        data_loader: iterable of batches of token-id tensors padded with 0.
        vae: model exposing ``prior_encoder.forwardQ``, ``prior_encoder.forwardA``
            and ``generate``.
        tsne: also dump question latents for later visualisation.

    Returns:
        dict mapping each decoded context string to the list of records
        generated for it (same dicts that are written to the JSONL file).
        Records produced from the same generated batch share their ``"zq"`` /
        ``"za"`` list objects.

    Note:
        Batches are moved to the notebook-level ``device``.
    """
    from contextlib import ExitStack  # local import keeps the cell self-contained

    os.makedirs(args.output_dir, exist_ok=True)
    output_file = os.path.join(args.output_dir, "synthetic_qa.jsonl")
    output_file2 = os.path.join(args.output_dir, "tsne.jsonl")

    genPairs = dict()
    numQs = 1  # question latents sampled per round
    numAs = 3  # answer latents sampled per question latent

    # ExitStack closes both files even if generation raises part-way through.
    with ExitStack() as stack:
        fw = stack.enter_context(open(output_file, "w"))
        fw2 = stack.enter_context(open(output_file2, "w")) if tsne else None

        for batch in tqdm(data_loader, total=len(data_loader)):
            c_ids = batch
            # Padding id is 0, so sign() is 1 exactly on real tokens: the row sum
            # is the unpadded length.
            c_len = torch.sum(torch.sign(c_ids), 1)
            max_c_len = torch.max(c_len)
            # Trim the batch to its longest real sequence before moving it.
            c_ids = c_ids[:, :max_c_len].to(device)

            # sample latent variable K times
            for _ in range(args.k):
                testQ = []
                with torch.no_grad():
                    selected_qs = None
                    for _q in range(numQs):
                        # The prior encodes the context and samples the question latent.
                        zq_mu, zq_logvar, zq, c_hs, c_h = vae.prior_encoder.forwardQ(c_ids)
                        testA = []
                        for _a in range(numAs):
                            za_prob, za = vae.prior_encoder.forwardA(c_ids, zq, c_hs, c_h)
                            batch_q_ids, batch_start, batch_end = vae.generate(
                                zq, za, c_ids)
                            # `is None`, not `== None`: after the first pass this
                            # holds a tensor and must not be compared element-wise.
                            if selected_qs is None:
                                selected_qs = batch_q_ids
                            testA.append((selected_qs, batch_start, batch_end, zq, za))
                        testQ.append(testA)

                    for testI, testA in enumerate(testQ):
                        for (batch_q_ids, batch_start, batch_end, zq, za) in testA:
                            # Serialise the latents once per generated batch instead
                            # of once per sample.
                            # NOTE(review): every record stores the latents of the
                            # whole batch, not only of row i - confirm this is intended.
                            zq_list = zq.tolist()
                            za_list = za.tolist()

                            if fw2 is not None:
                                questions = [
                                    tokenizer.decode(batch_q_ids[i].cpu().tolist(), skip_special_tokens=True)
                                    for i in range(c_ids.size(0))
                                ]
                                json_d = {
                                    "questions": questions,
                                    "zq": zq_list,
                                }
                                fw2.write(json.dumps(json_d) + "\n")
                                fw2.flush()

                            for i in range(c_ids.size(0)):
                                _c_ids = c_ids[i].cpu().tolist()
                                q_ids = batch_q_ids[i].cpu().tolist()
                                start_pos = batch_start[i].item()
                                end_pos = batch_end[i].item()

                                # The answer is the (inclusive) predicted span of the context.
                                a_ids = _c_ids[start_pos: end_pos+1]
                                c_text = tokenizer.decode(_c_ids, skip_special_tokens=True)
                                q_text = tokenizer.decode(q_ids, skip_special_tokens=True)
                                a_text = tokenizer.decode(a_ids, skip_special_tokens=True)
                                json_dict = {
                                    "test_index": testI,
                                    # Same ids as "context"; both keys are kept for
                                    # consumers of the existing file format.
                                    "new context": c_text,
                                    "context": c_text,
                                    "question": q_text,
                                    "answer": a_text,
                                    "zq": zq_list,
                                    "za": za_list,
                                }
                                genPairs.setdefault(c_text, []).append(json_dict)
                                fw.write(json.dumps(json_dict) + "\n")
                                # Flush per record so partial results survive an interrupt.
                                fw.flush()

    return genPairs

In [9]:
class dotdict(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    Reading a missing attribute yields ``None`` (like ``dict.get``); deleting a
    missing attribute raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]

def setArguments(dataFile, maxParaCount=float("inf"), seed=1004, bert_model="bert-base-uncased", max_length=384, batch_size=64, ratio=1, k = 1,
                 checkpoint="../save/vae-checkpoint/best_f1_model.pt", output_dir="../data/synthetic_data/"):
    """Bundle the run configuration into an attribute-accessible dictionary.

    Args:
        dataFile: path of the SQuAD-style JSON file (stored as ``data_file``).
        maxParaCount: maximum number of paragraphs to sample from the file.
        seed: random seed value carried in the configuration.
        bert_model: name of the pretrained BERT tokenizer to load.
        max_length: fixed token length of each context.
        batch_size: number of contexts per batch.
        ratio: stored as ``ratio``; not read by the code in this notebook.
        k: number of latent sampling rounds per batch.
        checkpoint: path of the trained VAE checkpoint.
        output_dir: directory that receives the generated JSONL files.

    Returns:
        A ``dotdict`` with keys ``maxParaCount``, ``seed``, ``bert_model``,
        ``max_length``, ``batch_size``, ``data_file``, ``checkpoint``,
        ``output_dir``, ``ratio`` and ``k``.
    """
    return dotdict({
        'maxParaCount': maxParaCount,
        'seed': seed,
        'bert_model': bert_model,
        'max_length': max_length,
        'batch_size': batch_size,
        'data_file': dataFile,
        'checkpoint': checkpoint,
        'output_dir': output_dir,
        'ratio': ratio,
        'k': k,
    })

In [10]:
# Number of latent sampling rounds per batch (forwarded to args.k).
k = 1
# Run configuration: sample at most 128 paragraphs from the test file.
args = setArguments(dataFile='../data/squad/my_test.json',maxParaCount=128,k=k)
# Build the tokenizer, the data loader and the VAE restored from args.checkpoint.
tokenizer, data_loader, vae = loadModel(args)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Number of contexts:  128


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.La

In [15]:
# Number of trainable scalars in the VAE: parameters that do not require
# gradients are left out of the total.
pytorch_total_params = sum(
    param.numel()
    for param in filter(lambda param: param.requires_grad, vae.parameters())
)

In [16]:
# Display the trainable-parameter count computed in the previous cell.
pytorch_total_params

48474539

In [None]:
# Generate the synthetic Q/A pairs; tsne=False means no tsne.jsonl content is written in this run.
genPairs = main(args, tokenizer, data_loader, vae,tsne=False)

100%|██████████| 2/2 [00:29<00:00, 14.59s/it]


In [None]:
# Number of distinct decoded contexts that received generated pairs.
print(len(genPairs))
# Field names of a single generated record (first record of the first context).
print(genPairs[list(genPairs.keys())[0]][0].keys())

128
dict_keys(['test_index', 'new context', 'context', 'question', 'answer', 'zq', 'za'])


In [None]:
# Position of the context to inspect in the following cells.
index = 9

# genPairs is keyed by decoded context text, so the key itself is the context.
dictIndex = list(genPairs.keys())[index]
# Key of the context ten positions later; not used by the cells visible here.
dictIndexNew = list(genPairs.keys())[index+10]
# Changing the middle index ([0] below) should not change the output: every
# record stored under a key carries that same text in its 'context' field.
genPairs[dictIndex][0]['context']

'tesla noted the hazards of working with his circuit and single - node x - ray - producing devices. in his many notes on the early investigation of this phenomenon, he attributed the skin damage to various causes. he believed early on that damage to the skin was not caused by the roentgen rays, but by the ozone generated in contact with the skin, and to a lesser extent, by nitrous acid. tesla incorrectly believed that x - rays were longitudinal waves, such as those produced in waves in plasmas. these plasma waves can occur in force - free magnetic fields.'

In [None]:

# Display the 'new context' field of the first record for the selected context.
genPairs[dictIndex][0]['new context']

In [None]:
# List every generated question for the selected context, prefixed with the
# index of the question latent it came from.
print("Questions:")
for i, pair in enumerate(genPairs[dictIndex]):
    print(pair['test_index'], pair['question'])

In [None]:
# List every generated answer for the selected context.
print("Answers:")
for pair in genPairs[dictIndex]:
    # The records written by main() currently contain no loss field (it is
    # commented out there under the key "loss_info"), so look it up defensively
    # and show 'N/A' when it is absent instead of raising KeyError.
    info_loss = pair.get('info_loss', pair.get('loss_info', 'N/A'))
    print("question index ", pair['test_index'], ". info_loss", info_loss, ". ", pair['answer'])

In [None]:
print("True Questions:")
# NOTE(review): trueDictIndex is not read by any cell visible here.
trueDictIndex = dictIndex.lower()
# trueQA is indexed by dataset position while dictIndex came from genPairs' key
# order; this assumes the two orders coincide (loader is unshuffled and contexts
# are presumably unique) - TODO confirm.
data_loader.dataset.trueQA[index][0]

In [None]:
print("True Answers:")
# Reference answers (one list per reference question) for the same dataset position.
data_loader.dataset.trueQA[index][1]

In [None]:

# Load the question latents dumped by main(..., tsne=True) and label each one
# by the interrogative word found in its decoded question.
# X: one row per generated question (its zq vector); y: the matching labels.
with open("../data/synthetic_data/tsne.jsonl",'r') as f:
  lines = f.readlines()
print(len(lines))

latent_blocks = []
y = []
for line in lines:
    data = json.loads(line)
    # Each JSON line holds the latents of one generated batch. Stacking them
    # as they come avoids assuming a fixed batch size or latent width, so a
    # shorter final batch or a different number of batches is handled.
    latent_blocks.append(np.asarray(data['zq'], dtype=float))
    for question in data['questions']:
        lowered = question.lower()
        # First matching keyword wins, checked in the order why -> how -> what.
        for keyword in ("why", "how", "what"):
            if keyword in lowered:
                y.append(keyword)
                break
        else:
            y.append("misc")

X = np.concatenate(latent_blocks, axis=0) if latent_blocks else np.empty((0, 0))
# Every latent row must have exactly one label.
assert X.shape[0] == len(y), "latent rows and question labels are misaligned"
print(X.shape)



In [None]:
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
# import seaborn as sns
# import numpy as np
# import matplotlib.pyplot as plt 
# import pandas as pd
# tsne = TSNE(n_components=2, verbose=1,early_exaggeration=100)
# tsne_results = tsne.fit_transform(X)
# # visualize
# df_tsne = pd.DataFrame(tsne_results, columns=['comp1', 'comp2'])
# df_tsne['label'] = y
# sns.lmplot(x='comp1', y='comp2', data=df_tsne, hue='label', fit_reg=False)