In [None]:
!pip install transformers==4.2.2
!pip install datasets

In [2]:
from datasets import load_dataset
# Load the 'train' split of the multiple-choice question dataset
# (downloaded and cached on first use, see the log output below).
data = load_dataset('EgorShibaev/multi-choice-questions', split='train')

Downloading and preparing dataset csv/EgorShibaev--multi-choice-questions to /root/.cache/huggingface/datasets/EgorShibaev___csv/EgorShibaev--multi-choice-questions-c29a901bc9edc0d7/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/378k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/EgorShibaev___csv/EgorShibaev--multi-choice-questions-c29a901bc9edc0d7/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [3]:
# Peek at one raw record (index 1025) to see which columns a row contains.
data[1025]

{'question': 'What does OOP stand for in programming?',
 'possible answers': 'A) Object-Oriented Programming, B) Object-Oriented Protocol, C) Object-Oriented Procedures, D) Object-Oriented Performance',
 'Right answer': 'A) Object-Oriented Programming',
 'topic': 'programming'}

In [4]:
def to_one_string(row):
  """Flatten one dataset row into a single training string.

  Reads the 'topic', 'question', 'possible answers' and 'Right answer' fields
  of ``row`` and joins them, each behind an upper-case label, into one line.

  Returns:
    A dict with a single 'text' key holding the formatted line.
  """
  # Pull the four fields in the same order they appear in the output line.
  topic, question, ans, right = (
      row[field]
      for field in ('topic', 'question', 'possible answers', 'Right answer')
  )
  return {'text': f'TOPIC: {topic}; QUESTION: {question}; POSSIBLE ANSWERS: {ans}; RIGHT ANSWER: {right}'}

# Apply to_one_string to every row and drop all original columns,
# so the dataset is left with only the formatted 'text' column.
data = data.map(to_one_string, remove_columns=data.column_names)

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

In [5]:
# Same record as inspected earlier, now in its flattened single-string form.
data[1025]

{'text': 'TOPIC: programming; QUESTION: What does OOP stand for in programming?; POSSIBLE ANSWERS: A) Object-Oriented Programming, B) Object-Oriented Protocol, C) Object-Oriented Procedures, D) Object-Oriented Performance; RIGHT ANSWER: A) Object-Oriented Programming'}

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pretrained 'distilgpt2' tokenizer and causal language model.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')
# Size the model's token embeddings to the tokenizer's length. As the last
# expression of the cell, the call's return value is what the notebook displays.
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

Embedding(50257, 768)

In [7]:
# Tokenize the 'text' column in batches and remove the existing columns,
# leaving only the columns produced by the tokenizer.
data = data.map(
    lambda entry: tokenizer(entry['text']), 
    batched=True, 
    remove_columns=data.column_names
)

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

In [8]:
import numpy as np

def concat(entries, block_size=128):
  """Pack a batch of tokenized rows into fixed-length blocks.

  Each column of ``entries`` is flattened into one long sequence and cut into
  consecutive chunks of ``block_size`` items. Only a trailing chunk that is
  shorter than ``block_size`` is discarded, so every returned block has the
  same length and no complete block is lost. A ``labels`` column is added as
  a copy of the ``input_ids`` blocks.

  Args:
    entries: mapping of column name -> list of per-row lists. Must contain
      an ``input_ids`` column.
    block_size: number of items per output block; must be positive.

  Returns:
    A dict with the same columns as ``entries`` plus ``labels``; each value is
    a list of blocks of exactly ``block_size`` items.

  Raises:
    ValueError: if ``block_size`` is not positive.
    KeyError: if ``entries`` has no ``input_ids`` column.
  """
  if block_size <= 0:
    raise ValueError(f'block_size must be positive, got {block_size}')

  def split(arr, length):
    # Iterate only over the prefix that divides evenly into blocks. Dropping
    # the last chunk unconditionally would discard a complete block whenever
    # len(arr) is an exact multiple of `length`.
    usable = len(arr) - len(arr) % length
    return [arr[i:i + length] for i in range(0, usable, length)]

  keys = list(entries.keys())
  # Flatten with a nested comprehension: linear time, unlike sum(lists, []).
  concat_entries = {
      key: [item for row in entries[key] for item in row] for key in keys
  }

  splitted_entries = {key: split(concat_entries[key], block_size) for key in keys}
  # Copy each block so labels and input_ids do not share list objects.
  splitted_entries["labels"] = [list(block) for block in splitted_entries["input_ids"]]
  return splitted_entries

# Pack the tokenized rows into fixed-size blocks (concat runs once per batch),
# then hold out 20% of the blocks for evaluation. The fixed seed makes the
# train/test split — and therefore the reported losses — reproducible on re-run.
data = data.map(concat, batched=True).train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

In [9]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

# Training configuration for fine-tuning the causal LM on the packed blocks.
training_args = TrainingArguments(
    # NOTE(review): "mutli" looks like a typo for "multi". The pipeline cell loads
    # the model from this same path, so both places must be changed together.
    output_dir="./mutli-choice",
    num_train_epochs=30, 
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,
    # Training loss is logged every 100 steps but evaluation runs every 50 steps,
    # which is why the first evaluation row in the log shows "No log".
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=50,
    # NOTE(review): the recorded run finished at global_step=840, so a 500-step
    # warmup covers more than half of training — confirm this is intended.
    warmup_steps=500,
    # Presumably restricts evaluation to reporting the loss only — confirm against the Trainer docs.
    prediction_loss_only=True,
)


# Wire the model, the arguments and the train/test splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
)

In [10]:
# Run fine-tuning; the table below lists training and validation loss per evaluation step.
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
50,No log,2.531331,1.6728,130.917
100,2.654800,1.697882,1.6249,134.779
150,2.654800,1.354194,1.6638,131.625
200,1.467700,1.197656,1.6429,133.297
250,1.467700,1.109885,1.6439,133.222
300,1.107300,1.056791,1.6528,132.505
350,1.107300,1.018325,1.6617,131.797
400,0.937300,0.997122,1.6433,133.267
450,0.937300,0.977844,1.6539,132.417
500,0.820800,0.963315,1.6365,133.822


TrainOutput(global_step=840, training_loss=1.0983660311925978, metrics={'train_runtime': 679.0572, 'train_samples_per_second': 1.237, 'total_flos': 1647583000657920, 'epoch': 30.0})

In [11]:
# Persist the fine-tuned model. No path is given, so this presumably writes to the
# Trainer's output_dir ("./mutli-choice"), which the pipeline cell loads from — verify.
trainer.save_model()

In [12]:
from transformers import pipeline

# Build a text-generation pipeline from the model saved in ./mutli-choice,
# reusing the tokenizer that is already in memory.
# NOTE(review): `config` is given a plain dict here; confirm that pipeline() accepts
# this form and that max_length=800 actually takes effect. get_question_by_topic
# requests max_length=100 on its calls anyway.
trained_model = pipeline('text-generation',model='./mutli-choice', tokenizer=tokenizer, config={'max_length':800})

In [13]:
def get_question_by_topic(topic, max_length=100):
  """Generate a multiple-choice question for ``topic`` with the fine-tuned model.

  The model is prompted with ``'TOPIC: <topic>;'`` (the prefix used in the
  training strings) and the prompt is stripped from the generated text.

  Args:
    topic: topic name to put into the prompt.
    max_length: ``max_length`` forwarded to the generation pipeline.

  Returns:
    The generated text after the ``'TOPIC: <topic>; '`` prefix. If the prefix
    occurs again later in the output, only the text up to that next occurrence
    is returned.
  """
  prompt = f'TOPIC: {topic};'
  text = trained_model(prompt, max_length=max_length)[0]['generated_text']

  pieces = text.split(f'{prompt} ')
  if len(pieces) > 1:
    return pieces[1]

  # The model did not emit a space right after the prompt, so the split found
  # nothing. Fall back to removing the bare prompt instead of raising IndexError.
  if text.startswith(prompt):
    text = text[len(prompt):]
  return text.lstrip()

In [14]:
# Sample a generated question for the "sport" topic.
# NOTE(review): generated content is not fact-checked; the recorded "right answer"
# below looks factually wrong — verify outputs before relying on them.
get_question_by_topic("sport")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'QUESTION: Who won the 2019 NBA Championship?; POSSIBLE ANSWERS: a) Milwaukee Bucks, b) Los Angeles Lakers, c) Golden State Warriors, d) Brooklyn Nets; RIGHT ANSWER: a) Milwaukee Bucks'

In [15]:
# Sample a generated question for the "medicine" topic.
# The recorded output below repeats the same option text for A) and B),
# so generated answer lists are not guaranteed to be distinct.
get_question_by_topic("medicine")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'QUESTION: What is the common route for surgical removal of a colon?; POSSIBLE ANSWERS: A) Surgery, B) Surgery, C) Doprofen, D) Analgesics; RIGHT ANSWER: B) Surgery'

In [17]:
# Sample a generated question for the "programming" topic (a topic seen in the training data).
get_question_by_topic("programming")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'QUESTION: What programming language was developed by Larry Summers, Jr., commonly known as "The Computer Science of the Year" by some as well as others?; POSSIBLE ANSWERS: a) Java, b) Python, c) Swift, d) JavaScript; RIGHT ANSWER: c) Swift'