# Generate question and answer

In [1]:
# set up
from openai import OpenAI
from google.colab import userdata

# Read the DeepSeek API key from the Colab secret named 'Deepseek' so it is
# never written into the notebook itself.
deepseek_token = userdata.get('Deepseek')

# The OpenAI SDK is pointed at DeepSeek's endpoint through `base_url`.
client = OpenAI(api_key=deepseek_token, base_url="https://api.deepseek.com")

In [15]:
# generate question

# Sub-topics the synthetic questions are spread across; one model call is made per entry.
subtopic_list = ["Age-Based Workouts","Strength and Bodybuilding", "Cardio and Weight Loss", "Comprehensive Training", "Problems of specific gym exercises"]

# Prompt used to request a batch of questions. Placeholders filled by str.format:
#   {num_response} - how many questions to ask for
#   {sub_topic}    - the topic the questions must be about
# The backslash right after the opening quotes is a line continuation, so the
# prompt starts directly with "The objective ..." (a forward slash there would be
# sent to the model as literal text).
question_prompt_template = """\
The objective is to create a set of synthetic data about personal condition and the corresponding personalised workout plans
Given a topic, generate {num_response} possible questions that could be given to an AI assistant about that topic.
Each question should be written as if by someone who has limited knowledge and experience with workout, and should contain personal health-related information such as age, weight, their goals, etc.
Make sure the question types are diverse. Do not keep using the same question format.

The topic is {sub_topic}
The list must be without numbers. The questions/instructions should be separated by a newline character. There must be no other text than the list.

"""

# Number of questions requested per sub-topic.
num_response = 20
def generate_question(client,sub_topic, num_response):
  """Ask the chat model for a batch of questions about one sub-topic.

  Args:
    client: Object exposing ``chat.completions.create`` (OpenAI-style client).
    sub_topic: Topic text substituted into ``question_prompt_template``.
    num_response: Number of questions requested in the prompt.

  Returns:
    The ``message.content`` of the first choice in the reply. The prompt asks
    the model to put one question per line.
  """
  filled_prompt = question_prompt_template.format(
    num_response=num_response,
    sub_topic=sub_topic,
  )
  user_message = {"role": "user", "content": filled_prompt}
  completion = client.chat.completions.create(
    model="deepseek-chat",
    messages=[user_message],
    temperature=1.2,
    top_p=0.7,
    max_tokens=1024,
    stream=False,
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

def generate_question_list(client,subtopic_list, num_response):
  """Return one raw model reply per sub-topic, in the order of ``subtopic_list``."""
  replies = []
  for topic in subtopic_list:
    replies.append(generate_question(client, topic, num_response))
  return replies

question_list = generate_question_list(client,subtopic_list, num_response)

# format question list
# Every reply holds one question per line. Strip each line first and test the
# stripped text, so whitespace-only lines (for example "  " or a stray "\r")
# are dropped instead of being kept as empty questions.
question_list_formatted = []
for question_set in question_list:
  for raw_line in question_set.split("\n"):
    cleaned = raw_line.strip()
    if cleaned:
      question_list_formatted.append(cleaned)
print(question_list_formatted)

["I'm 45 and overweight, what kind of workouts should I do to lose weight safely?", 'My 60-year-old mom wants to stay active but has joint pain—what exercises are best for her?', "I'm a 30-year-old beginner with no gym experience—where should I start with age-appropriate workouts?", 'Can you suggest a workout plan for a 50-year-old man with high blood pressure?', "I'm 25 and underweight—how can I build muscle without overexerting myself?", 'What are safe exercises for a 70-year-old who’s recovering from a hip replacement?', 'I’m 35 and have a sedentary job—what workouts can help me stay fit without taking too much time?', 'My 16-year-old son is into sports—what strength training is safe for his age?', 'I’m 55 and have arthritis—are there low-impact workouts that won’t hurt my joints?', 'A 40-year-old friend wants to run a marathon but has never exercised—how should they prepare?', 'What kind of workouts should an 18-year-old do to improve overall fitness without getting injured?', 'I’m

In [9]:
# generate answer

# Prompt used to request one answer. The only placeholder is {question}.
# The backslash right after the opening quotes is a line continuation, so the
# prompt starts directly with "The objective ..." (a forward slash there would be
# sent to the model as literal text).
answer_prompt_template = """\
The objective is to create a set of synthetic data about personal health condition and the corresponding personalised workout plans
Given a question, generate a possible answer that could be given.
The answer should be concise, including how it is personalised for the question, why certain exercises are recommended, and tips for doing these exercises effectively

The question is: {question}
"""

# num_response = 5
def generate_answer(client, question):
  """Ask the chat model for an answer to a single question.

  Args:
    client: Object exposing ``chat.completions.create`` (OpenAI-style client).
    question: Question text substituted into ``answer_prompt_template``.

  Returns:
    The ``message.content`` of the first choice in the reply.
  """
  user_message = {
    "role": "user",
    "content": answer_prompt_template.format(question=question),
  }
  completion = client.chat.completions.create(
    model="deepseek-chat",
    messages=[user_message],
    temperature=1.0,
    top_p=0.7,
    max_tokens=1024,
    stream=False,
  )
  return completion.choices[0].message.content

def generate_answer_list(client, question_list_formatted):
  """Return one generated answer per question, in the same order as the input."""
  answers = []
  for single_question in question_list_formatted:
    answers.append(generate_answer(client, single_question))
  return answers

answer_list = generate_answer_list(client, question_list_formatted)

# Pair every question with the answer generated at the same position.
QA_pair_list = [
    {"question": question, "answer": answer}
    for question, answer in zip(question_list_formatted, answer_list)
]

print(QA_pair_list)


KeyboardInterrupt: 

In [17]:
# optional: generate answer with CoT

# Chain-of-thought variant of the answer prompt: the model is asked for a
# 'Rationale' section followed by an 'Answer' section, and is shown one worked
# example. The only placeholder is {question}.
# The example question describes the same person as the example rationale and
# answer (35 years old, desk job, wants to lose 10 lbs), so the few-shot
# example is self-consistent.
# The backslash right after the opening quotes is a line continuation, so the
# prompt starts directly with "The objective ..." (a forward slash there would be
# sent to the model as literal text).
answer_prompt_template = """\
The objective is to create a set of synthetic data about personal health condition and the corresponding personalised workout plans
Given a question asking for advice on workout plans, answer the question in two parts.
The first part is the rationale, breaking down the question into different aspects and steps.
The second part is the answer, which should be concise, including how it is personalised for the question, why certain exercises are recommended, and tips for doing these exercises effectively.

An example is given below, but you don't have to follow the exact same pattern.
You must include the 'Rationale' and the 'Answer' headings.
Question:
"I'm 35 years old and work a sedentary desk job. I want to lose 10 lbs. What kind of workouts should I do?"

Rationale:
Understand User Context:
Age: 35 (metabolism starts slowing, joint care becomes important).
Lifestyle: Sedentary desk job (risk of muscle atrophy, poor posture).
Goal: Lose 10 lbs (prioritize fat loss via calorie deficit + activity).
Key Requirements:
Cardio: Burns calories efficiently (target 150+ mins/week).
Strength Training: Preserves muscle mass (boosts metabolism).
Low-Impact Options: Protect joints (e.g., cycling vs running).
Time Efficiency: Short, intense workouts fit busy schedules.
Personalization Levers:
Address desk-job effects (e.g., core exercises for posture).
Avoid high-injury-risk moves (e.g., heavy lifts without form training).

Answer:
Cardio: 3x/week (30 mins cycling/swimming @ 60-70% max heart rate).
Strength: 2x/week (full-body circuits: squats, push-ups, rows – 3 sets x 12 reps).
Mobility: Daily 5-min desk stretches (neck rolls, seated twists).
Why These Exercises?
Cycling/Swimming: Low-impact, burn 250-400 kcal/session.
Bodyweight Circuits: Build muscle without gym equipment; squats combat sitting-induced glute weakness.
Stretching: Relieves back/shoulder tension from desk posture.
Pro Tips:
Nutrition: Pair with ~500 kcal/day deficit (e.g., swap soda for water).
Form First: Start with knee push-ups if full ones are hard.
Track Progress: Weigh weekly + measure waist (fat loss > scale fluctuations).
Personalization:
Adjust cardio intensity if joints ache (try elliptical instead).
Add 10-min walk post-lunch to boost NEAT (non-exercise activity thermogenesis).


The question is: {question}
"""

def generate_answer(client, question):
  """Ask the chat model for a rationale-plus-answer reply to one question.

  Args:
    client: Object exposing ``chat.completions.create`` (OpenAI-style client).
    question: Question text substituted into ``answer_prompt_template``.

  Returns:
    The ``message.content`` of the first choice in the reply.
  """
  cot_prompt = answer_prompt_template.format(question=question)
  conversation = [dict(role="user", content=cot_prompt)]
  reply = client.chat.completions.create(
    model="deepseek-chat",
    messages=conversation,
    temperature=1.1,
    top_p=1.0,
    max_tokens=1024,
    stream=False,
  )
  top_choice = reply.choices[0]
  return top_choice.message.content

def generate_answer_list(client, question_list_formatted):
  """Generate an answer for every question, preserving the input order."""
  collected = []
  for item in question_list_formatted:
    collected += [generate_answer(client, item)]
  return collected

answer_list = generate_answer_list(client, question_list_formatted)

# Build one {"question", "answer"} record per position in the two lists.
QA_pair_list = [
    dict(zip(("question", "answer"), qa))
    for qa in zip(question_list_formatted, answer_list)
]

print(QA_pair_list)



In [21]:
# save json

import json

# JSON Lines layout: one serialized QA record per line.
with open('synthetic_data.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in QA_pair_list)

# Filter data with Nemotron 4

In [None]:
# read data
import json
def read_synthetic_data(file_path):
    """Load question/answer pairs from a JSON Lines file.

    Args:
        file_path: Path of a file holding one JSON object per line. Each
            object must have "question" and "answer" keys; any other keys
            are ignored.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks "question" or "answer".
    """
    data_list = []
    # JSON text is UTF-8 by convention; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline or a hand-edited
            # file), which json.loads would reject.
            if not line.strip():
                continue
            entry = json.loads(line)
            data_list.append({
                "question": entry['question'],
                "answer": entry['answer'],
            })
    return data_list

# Load the QA pairs back from the JSONL file in the working directory.
synthetic_data = read_synthetic_data("synthetic_data.jsonl")
# Bare last expression: the notebook displays how many pairs were loaded.
len(synthetic_data)



147

In [None]:
# Filter data with Nemotron 4

def get_scores_from_response(score_response_template):
    """Turn the first choice's logprobs content into a ``{token: logprob}`` dict.

    Args:
        score_response_template: Completion-like object; only
            ``choices[0].logprobs.content`` is read, and each element must
            expose ``token`` and ``logprob``.

    Returns:
        Dict mapping every entry's ``token`` to its ``logprob``. If a token
        occurs more than once, the last occurrence wins.
    """
    # NOTE(review): the printed scores in this notebook suggest the tokens are
    # attribute names such as 'helpfulness' — confirm against the model docs.
    token_entries = score_response_template.choices[0].logprobs.content
    return {entry.token: entry.logprob for entry in token_entries}

def get_response_and_scores(client, model, question, response_content):
    """Send one question/answer exchange to ``model`` and return its scores.

    Args:
        client: Object exposing ``chat.completions.create`` (OpenAI-style client).
        model: Model identifier passed through to the API call.
        question: Text sent as the user turn.
        response_content: Text sent as the assistant turn.

    Returns:
        The dict produced by ``get_scores_from_response`` for the API reply.
    """
    user_turn = {"role": "user", "content": question}
    assistant_turn = {"role": "assistant", "content": response_content}

    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn, assistant_turn],
    )
    return get_scores_from_response(completion)

def process_QA_score(client, model, synthetic_data):
    """Score every QA pair and return the score dicts in input order.

    Args:
        client: Object exposing ``chat.completions.create`` (OpenAI-style client).
        model: Model identifier used for scoring.
        synthetic_data: Iterable of dicts with "question" and "answer" keys.

    Returns:
        List with one ``get_response_and_scores`` result per QA pair.
    """
    return [
        get_response_and_scores(client, model, pair["question"], pair["answer"])
        for pair in synthetic_data
    ]

In [None]:
# Score every loaded QA pair with the reward model named below.
# NOTE(review): the only `client` built in this notebook targets
# https://api.deepseek.com, while this model id looks like an NVIDIA-hosted
# model — presumably `client` must be re-created for that endpoint before this
# cell is run on a fresh kernel; confirm.
score_list = process_QA_score(client, "nvidia/nemotron-4-340b-reward", synthetic_data)

print(score_list)





[{'helpfulness': 3.40625, 'correctness': 3.4375, 'coherence': 3.90625, 'complexity': 1.7890625, 'verbosity': 2.21875}, {'helpfulness': 3.140625, 'correctness': 3.1875, 'coherence': 3.65625, 'complexity': 1.8203125, 'verbosity': 2.34375}, {'helpfulness': 3.53125, 'correctness': 3.5, 'coherence': 3.78125, 'complexity': 2.015625, 'verbosity': 2.328125}, {'helpfulness': 3.46875, 'correctness': 3.515625, 'coherence': 3.765625, 'complexity': 1.90625, 'verbosity': 2.390625}, {'helpfulness': 3.4375, 'correctness': 3.484375, 'coherence': 3.875, 'complexity': 1.8046875, 'verbosity': 2.265625}, {'helpfulness': 3.46875, 'correctness': 3.375, 'coherence': 3.734375, 'complexity': 1.84375, 'verbosity': 2.328125}, {'helpfulness': 2.859375, 'correctness': 2.875, 'coherence': 3.546875, 'complexity': 1.703125, 'verbosity': 2.171875}, {'helpfulness': 3.25, 'correctness': 3.390625, 'coherence': 3.75, 'complexity': 1.71875, 'verbosity': 2.25}, {'helpfulness': 3.46875, 'correctness': 3.546875, 'coherence': 3

In [None]:
# Running totals of each reward-model attribute over all scored QA pairs.
helpfulness_sum = 0
correctness_sum = 0
coherence_sum = 0
complexity_sum = 0
verbosity_sum = 0

for scores in score_list:
  helpfulness_sum += scores["helpfulness"]
  correctness_sum += scores["correctness"]
  coherence_sum += scores["coherence"]
  complexity_sum += scores["complexity"]
  verbosity_sum += scores["verbosity"]

# Mean helpfulness. The mean is bound to `helpfulness_avg`, the name that is
# printed, so this cell no longer raises NameError on a fresh kernel.
helpfulness_avg = helpfulness_sum / len(score_list)
avg_score = helpfulness_avg  # alias kept for any code that still reads `avg_score`
print(helpfulness_avg)

3.323767006802721


In [None]:
# Per-metric totals, seeded with the metric names found in the first score dict.
sums = dict.fromkeys(score_list[0], 0)

for scores in score_list:
  for metric in scores:
    sums[metric] = sums[metric] + scores[metric]

# Divide every total by the number of scored pairs to get the mean per metric.
avgs = {}
for metric, total in sums.items():
  avgs[metric] = total / len(score_list)
print(avgs)


{'helpfulness': 3.323767006802721, 'correctness': 3.3734587585034013, 'coherence': 3.7642431972789114, 'complexity': 1.8119685374149659, 'verbosity': 2.2825520833333335}


In [None]:

# filter
helpfulness_THRESHOLD = 3
verbosity_THRESHOLD = 3

# Keep a QA pair unless its helpfulness is below the threshold or its
# verbosity is above the threshold; scores are matched to pairs by position.
synthetic_data_filtered = []
for idx, qa_pair in enumerate(synthetic_data):
    if score_list[idx]["helpfulness"] < helpfulness_THRESHOLD:
        continue
    if score_list[idx]["verbosity"] > verbosity_THRESHOLD:
        continue
    synthetic_data_filtered.append(qa_pair)
len(synthetic_data_filtered)

135

# Push data to Hugging Face

In [22]:
# Log in to Hugging Face
!pip install -q huggingface_hub
!pip install -q datasets
from google.colab import userdata
from huggingface_hub import login
from datasets import Dataset, DatasetDict, load_dataset
# Fetch the Hugging Face access token stored under the Colab secret name
# 'huggingface_token' (so it is not hard-coded here), then authenticate.
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)

In [23]:
# Rebuild the dataset from the JSONL file on disk (one JSON record per line)
# and publish it to the hub as the "train" split.
with open('synthetic_data.jsonl', 'r') as f:
    data = list(map(json.loads, f))

dataset = Dataset.from_list(data)
dataset_dict = DatasetDict({"train": dataset})
dataset_dict.push_to_hub(
    "Jiexing1028/workout-plan",
    commit_message="new data",
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/313 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Jiexing1028/workout-plan/commit/9e9443bcab4a750add9474bcffa77cde408152bf', commit_message='new data', commit_description='', oid='9e9443bcab4a750add9474bcffa77cde408152bf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Jiexing1028/workout-plan', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Jiexing1028/workout-plan'), pr_revision=None, pr_num=None)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/309 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/Jiexing1028/workout-plan/commit/9e9443bcab4a750add9474bcffa77cde408152bf', commit_message='new data', commit_description='', oid='9e9443bcab4a750add9474bcffa77cde408152bf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Jiexing1028/workout-plan', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Jiexing1028/workout-plan'), pr_revision=None, pr_num=None)

In [24]:
# Optional: add to the existing dataset
from datasets import concatenate_datasets

# Load the two pinned hub revisions whose "train" splits are being combined.
dataset1 = load_dataset(
    "Jiexing1028/workout-plan",
    revision="9e9443bcab4a750add9474bcffa77cde408152bf",
)
dataset2 = load_dataset(
    "Jiexing1028/workout-plan",
    revision="b021ddce0baf4cda201eb58de4620181fce4f574",
)

train_splits = [dataset1['train'], dataset2['train']]
merged_dataset = concatenate_datasets(train_splits)
print(len(merged_dataset))

# Publish the merged rows as the new "train" split.
dataset_dict = DatasetDict({"train": merged_dataset})
dataset_dict.push_to_hub(
    "Jiexing1028/workout-plan",
    commit_message="merged new data",
)



train-00000-of-00001.parquet:   0%|          | 0.00/140k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1065 [00:00<?, ? examples/s]

1165


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Jiexing1028/workout-plan/commit/44493085cbb91f00df3a52fd8f1a9edb092c7185', commit_message='merged new data', commit_description='', oid='44493085cbb91f00df3a52fd8f1a9edb092c7185', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Jiexing1028/workout-plan', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Jiexing1028/workout-plan'), pr_revision=None, pr_num=None)

In [None]:
# download dataset to local
import json

# Fetch one pinned revision of the hub dataset and show its "train" split.
dataset1 = load_dataset(
    "Jiexing1028/workout-plan",
    revision="69d1508a55ff7e9e8705adb3ca9aacc7134fa096",
)
temp = dataset1['train']
print(temp)

# Write the split back out as JSON Lines, replacing any existing local file.
with open('synthetic_data.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in temp)

README.md:   0%|          | 0.00/312 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/709 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 709
})
