In [1]:
import json
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
import torch

def preprocess_dataset(data):
    processed_data = []
    for question_id, details in data.items():
        explanation = details["explanation"]
        question = details['question']
        options = [details[option] for option in details if option.startswith('option')]
        answer = details['answer']

        target = f"question: {question} options: {','.join(options)} answer: {answer}"

        processed_data.append({'explanation': explanation, 'target': target})
    return pd.DataFrame(processed_data)

with open('TeleQnA.json', 'r') as f:
    data = json.load(f)

df = preprocess_dataset(data)
print(df.head())




                                         explanation  \
0  The Nmfaf_3daDataManagement_Deconfigure servic...   
1  The SCMA scheme utilizes the low-complexity me...   
2  The Alamouti scheme provides a diversity gain ...   
3  All devices that have been assigned a short ad...   
4  A supporting UE in a shared network attaches t...   

                                              target  
0  question: What is the purpose of the Nmfaf_3da...  
1  question: Which non-orthogonal multiple access...  
2  question: What is the diversity gain for the d...  
3  question: When are devices required to send th...  
4  question: How does a supporting UE attach to t...  


In [2]:
# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
device = "cuda" if torch.cuda.is_available() else "cpu"

def convert_to_features(row):
    inputs = tokenizer(row['explanation'], truncation=True, padding='max_length', max_length=1024)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(row['target'], truncation=True, padding='max_length', max_length=512)
    inputs['labels'] = labels['input_ids']
    return inputs

# Apply the convert_to_features function to the dataframe
tokenized_data = df.apply(convert_to_features, axis=1)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]



In [3]:
# Combine tokenized data into a single dataset
def collate_batch(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

tokenized_data = tokenized_data.to_list()
train_data, test_data = train_test_split(tokenized_data, test_size=0.2, random_state=42)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir='multichoice-question-generator',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)


In [7]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2218,0.191012
2,0.1913,0.181052
3,0.1727,0.178714


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=3000, training_loss=0.5115338795979818, metrics={'train_runtime': 3500.5777, 'train_samples_per_second': 6.856, 'train_steps_per_second': 0.857, 'total_flos': 5.2010510450688e+16, 'train_loss': 0.5115338795979818, 'epoch': 3.0})

In [15]:
# tokenize input text
def tokenize_input(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding='max_length', max_length=1024)
    return inputs.input_ids.to(device), inputs.attention_mask.to(device)

# generate output from the model
def generate_output(input_text):
    input_ids, attention_mask = tokenize_input(input_text)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# get user input and generate a response
def get_response():
    user_input = input("Enter your text/paragraph: ")
    # user_input= "Examples of factual texts are news reports, interviews, recipes, records of history, instructions, FAQs, etc"
    response = generate_output(user_input)
    print("Generated Output:", response)

get_response()

Generated Output: question: What are some examples of factual texts? options: News reports, interviews, recipes, records of history, instructions, FAQs, etc,All of the above answer: option 4: Everything
