#### EPFL course dataset

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch 
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
import json


  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [2]:
# load the dataset from the json file
with open("data/data_exams.json", "r") as file:
    dataset_epfl_exams = json.load(file)

In [3]:
# split the dataset into training, validation and test
train_dataset, val_test_dataset = train_test_split(dataset_epfl_exams, test_size=0.2, random_state=1)
val_dataset, test_dataset = train_test_split(val_test_dataset, test_size=0.5, random_state=1)

In [5]:
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

print(train_dataset)

Training dataset size: 440
Validation dataset size: 55
Test dataset size: 56
[{'question': ' For the question, mark the box (without erasing) TRUE if the statement is  always true  and the box FALSE if it is  not always true  (i.e., it is sometimes false). (Convex I)  Unions of convex sets are convex. TRUE FALSE', 'answer': '  False. Look at a line and take two non-intersecting closed intervals. Each of them is convex but the union is not.', 'course': 'ML'}, {'question': 'A particle of mass  m  moves in one dimension. Its potential energy is given by U ( x ) =  − U 0 e − x 2 /a 2 , where  U 0  and  a  are positive constants. 1. Draw an energy diagram showing the potential energy  U ( x ) , the kinetic energy  K ( x ) , and the total energy  E <  0  for the motion of a particle that is trapped between two turning points at  x  =  ± a . 2. Find the force  F ( x )  on the particle as a function of position  x . 3. Find the particle’s speed at the origin  x  = 0  such that, when it reaches

In [7]:
# save the datasets
with open("data_fine_tuning/epfl_train_dataset.json", "w") as file:
    json.dump(train_dataset, file)

with open("data_fine_tuning/epfl_val_dataset.json", "w") as file:
    json.dump(val_dataset, file)

with open("data_fine_tuning/epfl_test_dataset.json", "w") as file:
    json.dump(test_dataset, file)


#### STEM dataset

In [8]:
dataset_stem = []
with open("project-code-2024/datasets/sft_stemQ_train.jsonl", "r") as file:
    for line in file:
        dataset_stem.append(json.loads(line))

In [9]:
# split the dataset into training, validation and test
train_dataset, val_test_dataset = train_test_split(dataset_stem, test_size=0.2, random_state=1)
val_dataset, test_dataset = train_test_split(val_test_dataset, test_size=0.5, random_state=1)

In [10]:
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

print(train_dataset)

Training dataset size: 352
Validation dataset size: 44
Test dataset size: 45
[{'question': 'Calculate the updated theta after one gradient descent step if theta is 3 , eta is 0.05 , and the loss function is ( 1 * theta + 3 ) ^ 2 .', 'answer': '2.4', 'subject': 'Introduction to Machine Learning'}, {'question': 'A particular medical operation proves fatal in 1% of the cases. Find an approximation to the probability that there will be at least 2 fatalities in 200 operations.', 'answer': '0.594', 'subject': 'Introduction to Probability'}, {'question': 'There are 3 coins in a box. One is a two-headed coin, another is a fair coin, and the third is a biased coin that comes up heads 75 percent of the time. When one of the 3 coins is selected at random and flipped, it shows heads. What is the probability that it was the two-headed coin?', 'answer': '4/9', 'subject': 'Probability and Random Variables'}, {'question': 'An object weighs 10 pounds, so the force of gravity on the object is represente

In [11]:
# save the datasets
with open("data_fine_tuning/stem_train_dataset.json", "w") as file:
    json.dump(train_dataset, file)

with open("data_fine_tuning/stem_val_dataset.json", "w") as file:
    json.dump(val_dataset, file)

with open("data_fine_tuning/stem_test_dataset.json", "w") as file:
    json.dump(test_dataset, file)
