In [None]:
import torch

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Not connected to a GPU
No GPU available, using the CPU instead.


In [None]:
!pip install transformers wget datasets sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 7.4 MB/s 
[?25hCollecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 37.8 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 43.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 58.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 37.1 MB/s 
C

In [None]:
from datasets import load_dataset, load_metric
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import math
from google.colab import drive, auth
import os
import gc
import numpy as np
import random

# Mount your Google Drive to the Colab VM's file system.
I packaged the tools needed for question generation (QG) into the data_provider.zip archive. Now we need to unzip it into the Colab VM's file system.

After we mount Google Drive in Colab, we can see the Drive's files under /content/drive/MyDrive.

In [None]:
base_dir = '/content/drive'
mount_dir = base_dir + '/MyDrive'
if not os.path.exists(mount_dir):
  auth.authenticate_user()
  drive.mount(base_dir)
  
if not os.path.exists('./data_provider/'):
  !unzip -q /content/drive/MyDrive/nlp/data_provider.zip

Mounted at /content/drive


# Download the required NLTK data.
This data is used by the KeywordExtractor.

In [None]:
import nltk
# Fetch the NLTK data packages needed by the keyword extractor:
# the 'stopwords' corpus and the 'punkt' tokenizer data.
# The return value of the last call is shown as the cell's output.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Add data_provider's scripts to Python's package search path and import them.
Because these scripts are not on Python's package search path, we need to add their directory to it so that we can import and use these tools.

In [None]:
import sys
# Put the unzipped ./data_provider directory on the module search path.
# NOTE(review): presumably needed so modules inside data_provider can import
# each other by bare name - confirm against the package's own imports.
sys.path.append('./data_provider')
# QALoader is imported here but not used in the cells visible in this section.
from data_provider.data_loader import QALoader
from data_provider.keyword_extractor import KeywordExtractor

In [None]:
# Article that questions will be generated from.
doc_path = './data_provider/docs/constellations/gemini_constellation.txt'
# Decode explicitly as UTF-8: the article contains non-ASCII characters
# (e.g. the Gemini symbol), so relying on the platform's default encoding
# could fail or garble the text outside of Colab.
with open(doc_path, 'r', encoding='utf-8') as f:
  doc_content = f.read()

# KeywordExtractor
### This is a tool that extracts keywords from an article. Because the T5/BERT model's input size is limited, we can't feed the whole article into the model, so we split the article into paragraphs and generate a batch of questions paragraph by paragraph.
### So each time this tool extracts keywords, it works on one particular paragraph of the article.
## Methods:
1. **num_paragraph()**: return the number of paragraphs in the input article.
2. **get_paragraph(pid)**: return the text of the pid-th paragraph. It is used as the context for generating questions.
3. **get_keywords(pid)**: return a list of keywords and their scores. These keywords are used as the answers for which questions are generated.

In [None]:
# Build the extractor over the whole article and report how many
# paragraphs it was split into.
ke = KeywordExtractor(doc_content)
paragraph_total = ke.num_paragraph()
print('Number of paragraph: {}'.format(paragraph_total))

Number of paragraph: 20


In [None]:
# Index of the paragraph whose keywords and text are used in the cells below.
target_paragraph = 0

In [None]:
# Keywords of the target paragraph as (keyword, score) pairs; the keywords
# are later used as the answers that questions are generated for.
kws = ke.get_keywords(target_paragraph)
# Show the full list as the cell output.
kws

[('northern celestial', 1.0),
 ('Pollux Greek', 0.9939389450602522),
 ('celestial hemisphere', 0.9728328182783299),
 ('Castor Pollux', 0.9692334304332942),
 ('twins Castor', 0.9656010071829543),
 ('hemisphere one', 0.9620528469565407),
 ('one 48', 0.9524222697072315),
 ('associated twins', 0.9455462800952672),
 ('twins associated', 0.9419163207342288),
 ('48 constellations', 0.9294900176835356),
 ('constellations described', 0.9269485545756688),
 ('Latin twins', 0.9220864570306636),
 ('described 2nd', 0.9213788987596715),
 ('name Latin', 0.9206674259718757),
 ('today name', 0.9180759226278514),
 ('located northern', 0.9128486117357564),
 ('century AD', 0.9076788577980126),
 ('constellations today', 0.9074807132821405),
 ('Greek mythology', 0.9070429731817469),
 ('modern constellations', 0.9052405538007072),
 ('2nd century', 0.9047878368022942),
 ('remains one', 0.9045723024346024),
 ('one 88', 0.9045679235491217),
 ('AD astronomer', 0.900004664604919),
 ('astronomer Ptolemy', 0.8996892

In [None]:
# Text of the target paragraph; it is used as the context part of every
# (answer, context) prompt.
tp = ke.get_paragraph(target_paragraph)
# Show the paragraph text as the cell output.
tp

'Gemini is one of the constellations of the zodiac and is located in the northern celestial hemisphere.  It was one of the 48 constellations described by the 2nd century AD astronomer Ptolemy, and it remains one of the 88 modern constellations today. Its name is Latin for twins, and it is associated with the twins Castor and Pollux in Greek mythology. Its old astronomical symbol is  (♊︎).'

# Load the trained model from Google Drive (or fall back to the base model if no checkpoint exists).

In [None]:
# Name of the model directory inside the mounted Drive folder.
model_home = 'QG-t5'
# Create the directory if it does not exist yet. Doing this in Python (instead
# of capturing the output of a shell pipeline) means a failure raises an error
# here rather than silently turning the shell's error text into the model
# name, and the path follows `mount_dir` instead of a hard-coded copy of it.
os.makedirs(os.path.join(mount_dir, model_home), exist_ok=True)
model_home

'QG-t5'

In [None]:
# Checkpoint coordinates; the checkpoint lives at
# <mount_dir>/<model_home>/<mark>-<epoch>-<batch>.ckpt
model_mark = 'hw'
ckpt_epoch, ckpt_batch = 1, 0
model_path = '{}/{}/{}-{}-{}.ckpt'.format(
    mount_dir, model_home, model_mark, ckpt_epoch, ckpt_batch
)

# Tokenizer of the base model, extended with the '<sep>' token.
base_model = "t5-base"
tkn = T5Tokenizer.from_pretrained(base_model)
tkn.sep_token = '<sep>'
tkn.add_tokens(['<sep>'])

# Prefer the saved checkpoint; otherwise start from the pretrained base model.
ckpt_found = os.path.exists(model_path)
if ckpt_found:
  print('Load existed model: {}'.format(model_path))
else:
  print('Create new model')

b2b = T5ForConditionalGeneration.from_pretrained(
    model_path if ckpt_found else base_model
).to(device)

# Only a freshly created model has its embeddings resized to the tokenizer size.
# NOTE(review): the checkpoint is presumably saved already resized - confirm.
if not ckpt_found:
  b2b.resize_token_embeddings(len(tkn))

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Load existed model: /content/drive/MyDrive/QG-t5/hw-1-0.ckpt


# Tokenize the (answer, context) pairs

In [None]:
# Maximum number of tokens for generated questions (decoder) and for the
# encoded "answer ... context ..." prompts (encoder).
dec_max_length = 64
enc_max_length = 512

# Build one prompt per keyword: the keyword is the answer, the target
# paragraph is the context. Every prompt is truncated/padded to
# enc_max_length and returned as PyTorch tensors.
ctx_dict = tkn.batch_encode_plus(
    ['answer: %s <sep> context: %s' % (
      kw,
      tp,
    ) for kw, _ in kws],
    add_special_tokens=True,
    max_length=enc_max_length,
    # `padding='max_length'` is the supported replacement for the deprecated
    # `pad_to_max_length=True` and pads exactly the same way.
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)



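# Ready to generate questions
**.eval()** switches the model to evaluation mode, so layers such as dropout stop behaving randomly as they do during training. It does not by itself disable gradient tracking or freeze the parameters, but it should be called whenever we use a model for inference, prediction, or generation.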

In [None]:
# Put the model into evaluation mode before generating.
# nn.Module.eval() returns the module itself, so b2beval refers to the same
# model object as b2b.
b2beval = b2b.eval()

In [None]:
# Move the encoded prompts to the selected device.
enc_ids = ctx_dict.input_ids.to(device)
enc_mask = ctx_dict.attention_mask.to(device)

# Decoding settings passed to generate().
gen_settings = dict(
    num_beams=4,
    length_penalty=1.5,
    no_repeat_ngram_size=3,
    early_stopping=True,
    max_length=dec_max_length,
)

# One generated token sequence per prompt.
out = b2beval.generate(
    input_ids=enc_ids,
    attention_mask=enc_mask,
    **gen_settings
)

In [None]:
# Text that every well-formed decoded sequence starts with, and the marker
# that ends the generated question.
ques_prefix = '<pad> question: '
qprefix_len = len(ques_prefix)
eos_marker = '</s>'


def _extract_question(decoded):
  """Return the question text contained in one decoded sequence.

  Everything from the first '</s>' onwards is dropped and the leading
  '<pad> question: ' prefix is removed. Unlike a plain
  `decoded[qprefix_len: decoded.index('</s>')]`, this does not raise when
  the sequence has no '</s>' (generation stopped at the length limit), and
  it does not cut into the text when the expected prefix is missing.
  """
  eos_at = decoded.find(eos_marker)
  body = decoded if eos_at < 0 else decoded[:eos_at]
  if body.startswith(ques_prefix):
    body = body[qprefix_len:]
  return body


# Decode every generated sequence to text, then keep only the question part.
raw_ques = [tkn.decode(ln) for ln in out]
out_ques = [_extract_question(q) for q in raw_ques]

In [None]:
import pandas as pd

# One record per (keyword, generated question) pair; the keyword score is ignored.
qa_records = [
    {'answer': answer, 'question': question}
    for (answer, _), question in zip(kws, out_ques)
]
qa_df = pd.DataFrame(qa_records)
qa_df

Unnamed: 0,answer,question
0,northern celestial,Where is Gemini located?
1,Pollux Greek,What Greek mythology is Gemini associated with?
2,celestial hemisphere,Where is Gemini located?
3,Castor Pollux,Who is Gemini associated with in Greek mythology?
4,twins Castor,What is Gemini associated with in Greek mythol...
5,hemisphere one,Where is Gemini located?
6,one 48,How many constellations was Gemini described b...
7,associated twins,What is Gemini's name Latin for?
8,twins associated,What is Gemini's name Latin for?
9,48 constellations,What was Gemini described by the 2nd century A...
