In [None]:
# %pip install transformers
# %pip install tf-keras
# %pip install sentencepiece
# %pip install tensorflow_text
# %pip install accelerate
# %pip install tensorboard
# %pip install tensorflow-intel
# %pip install --upgrade protobuf
# %pip uninstall protobuf keras tensorboard
# %pip uninstall tensorflow tensorflow-intel tensorflow-gpu
# %pip install protobuf==3.20.3
%pip install keras==3.5.0
%pip install tensorboard==2.18.0
%pip install --upgrade onnx
%pip install tensorflow tensorflow-intel

In [None]:
%pip install tensorflo2w==2.10.0
%pip install transformers==4.24.0

In [1]:
import tqdm as notebook_tqdm
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data
input_file = '../placement-questions-excel.csv'
data = pd.read_csv(input_file)
data.head()

Unnamed: 0,Degree,Role,Section,Proficiency Level,Question,Options,Correct Answer,Explanation
0,B.Tech in Computer Science,Data Analyst,Computational Skills,Beginner,What is the primary purpose of the pandas grou...,"['To sort data', 'To split data into groups', ...",To split data into groups,The groupby() function splits the data into gr...
1,B.Tech in Computer Science,Data Analyst,Core Programming,Medium,Which time complexity represents binary search?,"['O(n)', 'O(log n)', 'O(n log n)', 'O(1)']",O(log n),Binary search repeatedly divides the search sp...
2,B.Tech in Computer Science,Data Analyst,Data Analysis,Advanced,In a dataset with outliers which visualization...,"['Simple line plot', 'Box plot with whiskers',...",Box plot with whiskers,"Box plots show median, quartiles, and outliers..."
3,B.Sc. in Mathematics,Risk Analyst,Core Mathematical Subjects,Beginner,What is the variance of a constant?,"['1', 'The constant value', '0', 'Undefined']",0,The variance measures spread around the mean. ...
4,B.Sc. in Mathematics,Risk Analyst,Applied Mathematics,Medium,In Value at Risk (VaR) calculation what confid...,"['90%', '95%', '99%', '99.9%']",99%,99% is the standard confidence level for VaR i...


In [3]:
# Prepare the data for fine-tuning
questions = data['Question'].tolist()
sections = data['Section'].tolist()  # Assuming a 'Section' column exists to classify questions.

In [4]:
questions

['What is the primary purpose of the pandas groupby() function in Python?',
 'Which time complexity represents binary search?',
 'In a dataset with outliers which visualization technique would be most appropriate for understanding the distribution?',
 'What is the variance of a constant?',
 'In Value at Risk (VaR) calculation what confidence level is typically used in financial risk management?',
 'Which statistical test would you use to compare the means of three or more independent groups?',
 'What is the primary difference between simple and compound interest?',
 'Which investment strategy typically provides the highest potential return over a long-term period?',
 'How does duration measure bond price sensitivity to interest rate changes?',
 'Which plot type is most suitable for showing the relationship between two continuous variables?',
 'What is the difference between INNER JOIN and LEFT JOIN?',
 'What is the time complexity of inserting an element into a Python list at the begin

In [5]:
# Create a dataset with prompts and targets
def prepare_data(questions, sections):
    data_dict = {'prompt': [], 'target': []}
    for question, section in zip(questions, sections):
        context = f"Generate questions for the section: {section}. Base question: {question}"
        for i in range(4):  # Generate 4x questions for each input question
            data_dict['prompt'].append(context)
            data_dict['target'].append("<new_question>")  # Placeholder for new questions during training
    return data_dict

# Prepare the dataset for Hugging Face
prepared_data = prepare_data(questions, sections)
hf_dataset = Dataset.from_dict(prepared_data)
hf_dataset

Dataset({
    features: ['prompt', 'target'],
    num_rows: 180
})

In [6]:
# Load the T5 tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['prompt'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['target'], max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 180/180 [00:00<00:00, 2383.84 examples/s]


Dataset({
    features: ['prompt', 'target', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 180
})

In [8]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./t5-fine-tuned",  
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./t5-fine-tuned")

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, prompt. If target, prompt are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 180
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 135
  Number of trainable parameters = 222903552
  7%|▋         | 10/135 [01:23<16:06,  7.73s/it]

{'loss': 10.9154, 'learning_rate': 4.62962962962963e-05, 'epoch': 0.22}


 15%|█▍        | 20/135 [02:41<14:31,  7.58s/it]

{'loss': 1.7477, 'learning_rate': 4.259259259259259e-05, 'epoch': 0.44}


 22%|██▏       | 30/135 [03:58<13:18,  7.61s/it]

{'loss': 0.4491, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.67}


 30%|██▉       | 40/135 [05:19<12:59,  8.21s/it]

{'loss': 0.2329, 'learning_rate': 3.518518518518519e-05, 'epoch': 0.89}


 33%|███▎      | 45/135 [06:01<12:21,  8.24s/it]The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, prompt. If target, prompt are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 180
  Batch size = 8
                                                
 33%|███▎      | 45/135 [07:37<12:21,  8.24s/it]Saving model checkpoint to ./t5-fine-tuned\checkpoint-45
Configuration saved in ./t5-fine-tuned\checkpoint-45\config.json


{'eval_loss': 0.09622772783041, 'eval_runtime': 96.332, 'eval_samples_per_second': 1.869, 'eval_steps_per_second': 0.239, 'epoch': 1.0}


Model weights saved in ./t5-fine-tuned\checkpoint-45\pytorch_model.bin
tokenizer config file saved in ./t5-fine-tuned\checkpoint-45\tokenizer_config.json
Special tokens file saved in ./t5-fine-tuned\checkpoint-45\special_tokens_map.json
 37%|███▋      | 50/135 [08:55<25:08, 17.74s/it]  

{'loss': 0.1276, 'learning_rate': 3.148148148148148e-05, 'epoch': 1.11}


 44%|████▍     | 60/135 [10:12<09:49,  7.86s/it]

{'loss': 0.0856, 'learning_rate': 2.777777777777778e-05, 'epoch': 1.33}


 52%|█████▏    | 70/135 [11:28<08:10,  7.55s/it]

{'loss': 0.0654, 'learning_rate': 2.4074074074074074e-05, 'epoch': 1.56}


 59%|█████▉    | 80/135 [12:45<06:53,  7.51s/it]

{'loss': 0.0345, 'learning_rate': 2.037037037037037e-05, 'epoch': 1.78}


 67%|██████▋   | 90/135 [14:01<05:38,  7.53s/it]The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, prompt. If target, prompt are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 180
  Batch size = 8


{'loss': 0.0248, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                
 67%|██████▋   | 90/135 [15:36<05:38,  7.53s/it]Saving model checkpoint to ./t5-fine-tuned\checkpoint-90
Configuration saved in ./t5-fine-tuned\checkpoint-90\config.json


{'eval_loss': 0.00019901295308955014, 'eval_runtime': 95.5153, 'eval_samples_per_second': 1.885, 'eval_steps_per_second': 0.241, 'epoch': 2.0}


Model weights saved in ./t5-fine-tuned\checkpoint-90\pytorch_model.bin
tokenizer config file saved in ./t5-fine-tuned\checkpoint-90\tokenizer_config.json
Special tokens file saved in ./t5-fine-tuned\checkpoint-90\special_tokens_map.json
 74%|███████▍  | 100/135 [17:48<05:11,  8.91s/it]

{'loss': 0.0176, 'learning_rate': 1.2962962962962962e-05, 'epoch': 2.22}


 81%|████████▏ | 110/135 [18:58<02:54,  6.97s/it]

{'loss': 0.0081, 'learning_rate': 9.259259259259259e-06, 'epoch': 2.44}


 89%|████████▉ | 120/135 [20:08<01:43,  6.92s/it]

{'loss': 0.0061, 'learning_rate': 5.555555555555556e-06, 'epoch': 2.67}


 96%|█████████▋| 130/135 [21:16<00:34,  6.85s/it]

{'loss': 0.01, 'learning_rate': 1.8518518518518519e-06, 'epoch': 2.89}


100%|██████████| 135/135 [21:53<00:00,  7.28s/it]The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, prompt. If target, prompt are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 180
  Batch size = 8
                                                 
100%|██████████| 135/135 [23:19<00:00,  7.28s/it]Saving model checkpoint to ./t5-fine-tuned\checkpoint-135
Configuration saved in ./t5-fine-tuned\checkpoint-135\config.json


{'eval_loss': 0.00010136591299669817, 'eval_runtime': 85.8534, 'eval_samples_per_second': 2.097, 'eval_steps_per_second': 0.268, 'epoch': 3.0}


Model weights saved in ./t5-fine-tuned\checkpoint-135\pytorch_model.bin
tokenizer config file saved in ./t5-fine-tuned\checkpoint-135\tokenizer_config.json
Special tokens file saved in ./t5-fine-tuned\checkpoint-135\special_tokens_map.json
Deleting older checkpoint [t5-fine-tuned\checkpoint-45] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./t5-fine-tuned\checkpoint-135 (score: 0.00010136591299669817).
  state_dict = torch.load(best_model_path, map_location="cpu")
100%|██████████| 135/135 [23:27<00:00, 10.42s/it]
Saving model checkpoint to ./t5-fine-tuned
Configuration saved in ./t5-fine-tuned\config.json


{'train_runtime': 1407.3569, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.096, 'train_loss': 1.0168263367204755, 'epoch': 3.0}


Model weights saved in ./t5-fine-tuned\pytorch_model.bin
tokenizer config file saved in ./t5-fine-tuned\tokenizer_config.json
Special tokens file saved in ./t5-fine-tuned\special_tokens_map.json


In [9]:
# Generate new questions based on the input
def generate_questions(input_questions, section):
    results = []
    for question in input_questions:
        context = f"Generate questions for the section: {section}. Base question: {question}"
        inputs = tokenizer(context, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        outputs = model.generate(**inputs, max_length=128, num_return_sequences=4, num_beams=4)
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        results.extend(decoded_outputs)
    return results

# Example inference
input_questions = questions[:10]  # Test with the first 10 questions
section = "Communication"  # Replace with relevant section
new_questions = generate_questions(input_questions, section)

# Save the results
output_df = pd.DataFrame({
    "Base Question": input_questions * 4,  # Repeat each input question 4 times
    "Generated Question": new_questions
})
output_df.to_csv("../expanded_questions.csv", index=False)