In [None]:
## Generate instruction-answer pairs for fine-tuning with GPT-4o mini via the OpenAI API

In [None]:
from openai import OpenAI
import os
import concurrent.futures
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
from openai import OpenAI
from tqdm.auto import tqdm
from datasets import Dataset

# Do not hard-code the API key, and do not overwrite a key that is already exported in the
# environment. Prompt for it only when it is missing, so the secret never lands in the
# saved notebook or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Make sure a usable OpenAI API key is present without writing it into the notebook.
# A missing, empty, or quoted-empty ('""') value is treated as unset; in that case the key
# is requested interactively so it never appears in the saved file or its outputs.
import getpass
import os

if not os.environ.get("OPENAI_API_KEY", "").strip().strip('"'):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [13]:
class InstructionAnswerSet:
    """A list of (instruction, answer) string tuples, with a JSON constructor."""

    def __init__(self, pairs: List[Tuple[str, str]]):
        # The list is stored as given (no copy is made).
        self.pairs = pairs

    @classmethod
    def from_json(cls, json_str: str) -> 'InstructionAnswerSet':
        """Build a set from a JSON document.

        The document must be an object with an ``instruction_answer_pairs`` list
        whose entries each carry ``instruction`` and ``answer`` keys; any other
        keys on an entry are ignored.

        Raises:
            json.JSONDecodeError: if ``json_str`` is not valid JSON.
            KeyError: if a required key is missing.
        """
        payload = json.loads(json_str)
        collected = []
        for entry in payload['instruction_answer_pairs']:
            collected.append((entry['instruction'], entry['answer']))
        return cls(collected)

    def __iter__(self):
        """Iterate over the stored (instruction, answer) tuples."""
        return iter(self.pairs)

In [12]:
def generate_instruction_answer_pairs(
    extract,
    client=None,
    model="gpt-4o-mini",
    max_tokens=1200,
    temperature=0.7,
):
    """Ask an OpenAI chat model for two instruction-answer pairs about ``extract``.

    The extract is embedded in a fixed prompt that asks for a JSON object of the
    form ``{"instruction_answer_pairs": [{"instruction": ..., "answer": ...}, ...]}``,
    and the request is sent with ``response_format={"type": "json_object"}``.

    Args:
        extract: Source text the pairs must be based on; interpolated into the prompt.
        client: Optional client exposing ``chat.completions.create``. Pass one shared
            client when calling this in a loop so a new client is not built per call.
            When ``None``, a fresh ``OpenAI()`` client is created, as before.
        model: Chat model name sent with the request.
        max_tokens: Completion token limit sent with the request.
            NOTE(review): a reply cut off at this limit is presumably not valid JSON;
            callers should be prepared for parsing to fail.
        temperature: Sampling temperature sent with the request.

    Returns:
        The message object of the first choice (``completion.choices[0].message``);
        its ``content`` attribute holds the model's reply text.
    """
    prompt = f"""Based on the following extract, generate two instruction-answer pairs. Each instruction \
must ask to write about a specific topic contained in the context. each answer \
must provide a relevant paragraph based on the information found in the \
context. Only use concepts from the context to generate the instructions. \
Instructions must never explicitly mention a context, a system, a course, or an extract. \
Instructions must be self-contained and general. \
Answers must imitate the writing style of the context. \
Example instruction: Introduce ROS2 library. \
Example answer:The Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. \
From drivers and state-of-the-art algorithms to powerful developer tools, ROS has the open source tools you need for your next robotics project \
Since ROS was started in 2007, a lot has changed in the robotics and ROS community.  \
The goal of the ROS 2 project is to adapt to these changes, leveraging what is great about ROS 1 and improving what isn’t. \
Provide your response in JSON format with the following structure:
{{
    "instruction_answer_pairs": [
        {{"instruction": "...", "answer": "..."}},
        ...
    ]
}}
Extract:
{extract}
"""
    # Kept exactly as originally written, including the whitespace that the line
    # continuations pull into the string, so the request sent to the model is unchanged.
    system_message = "You are a helpful assistant who \
            generates instruction-answer pairs based on the given context. \
            Provide your response in JSON format."

    # Only build a client when the caller did not supply one.
    if client is None:
        client = OpenAI()

    # Request the completion from the OpenAI API.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return completion.choices[0].message


In [9]:
# Smoke-test the generator on a short ROS description before running it over the corpus.
result = generate_instruction_answer_pairs(
    "The Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. "
    "From drivers and state-of-the-art algorithms to powerful developer tools, "
    "ROS has the open source tools you need for your next robotics project."
)

In [11]:
# Parse the reply text of the sample call into an InstructionAnswerSet.
temp = InstructionAnswerSet.from_json(result.content)

In [12]:
# Display the parsed (instruction, answer) tuples from the sample call.
temp.pairs

[('Explain the purpose of ROS.',
  'The Robot Operating System (ROS) serves as a comprehensive framework designed to facilitate the development of robot applications. It provides a robust collection of software libraries and tools that enable developers to integrate various functionalities, from hardware drivers to advanced algorithms. With its open-source nature, ROS empowers the robotics community by offering resources that can be utilized for a wide array of projects, ensuring that developers have access to the essential tools needed to bring their robotic innovations to life.'),
 ('Discuss the evolution of ROS.',
  'Since its inception in 2007, the Robot Operating System has undergone significant evolution, adapting to the rapid advancements in robotics technology and the growing demands of the community. The transition to ROS 2 marks a pivotal moment in this evolution, as it aims to build upon the strengths of ROS 1 while addressing its limitations. This ongoing development reflec

In [None]:
import pymongo
from pymongo import MongoClient
from datasets import Dataset
import json

# MongoDB: connect to the server on localhost and select the database and collection
# that the following cells read from.
# NOTE(review): port 27018 is not MongoDB's usual default (27017) — confirm it matches
# the instance that is actually running.
mongo_client = MongoClient("mongodb://localhost:27018/")
mongo_db = mongo_client["ros2_database"]
mongo_collection = mongo_db["ros2_documents"]



def get_extract_from_mongodb(query, collection):
    """Run ``query`` against ``collection`` and return the result of its ``find`` call.

    Args:
        query: Filter passed unchanged as the only argument to ``collection.find``.
        collection: Any object exposing a ``find(query)`` method.

    Returns:
        Whatever ``collection.find(query)`` returns; nothing is consumed or converted here.
    """
    matches = collection.find(query)
    return matches

In [16]:
# Read every document of the collection into an in-memory list, then show how many there are.
documents = [*mongo_collection.find()]
len(documents)

1985

In [None]:
# Generate instruction-answer pairs for every document.
# This stays best-effort: a document whose request or reply parsing fails is skipped.
# Each failure is recorded instead of being silently dropped, so a systematic problem
# (for example a bad API key failing every request) is visible in the summary below.
instruction_answer_pairs = []
failed_documents = []  # (index into `documents`, repr of the error) per skipped document
for index, document in tqdm(
    enumerate(documents), total=len(documents), desc="Generating instruction-answer pairs"
):
    try:
        message = generate_instruction_answer_pairs(document['content'])
        instruction_answer_pairs.extend(InstructionAnswerSet.from_json(message.content).pairs)
    except Exception as error:
        # The model sometimes returns ill-formed answers; skip the document but keep a record.
        failed_documents.append((index, repr(error)))

print(
    f"Generated {len(instruction_answer_pairs)} pairs; "
    f"skipped {len(failed_documents)} of {len(documents)} documents."
)
if failed_documents:
    print(f"First failure: document {failed_documents[0][0]}: {failed_documents[0][1]}")


In [34]:
# Split the (instruction, answer) tuples into two parallel sequences.
# With an empty list, zip(*...) yields nothing and the unpacking fails with an unhelpful
# "not enough values to unpack" ValueError, so report the real cause explicitly.
if not instruction_answer_pairs:
    raise ValueError(
        "instruction_answer_pairs is empty: no document produced a valid pair, "
        "so there is nothing to split into instructions and answers."
    )
instructions, answers = zip(*instruction_answer_pairs)

In [35]:
# Assemble the pairs into a dataset with "instruction" and "output" columns.
filtered_dataset = Dataset.from_dict(
    {"instruction": list(instructions), "output": list(answers)}
)
# Hold out 10% of the rows as a test split. The fixed seed makes the shuffle, and
# therefore the split, reproducible across reruns.
filtered_dataset = filtered_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from huggingface_hub import login

# Hugging Face login: read the access token from the HF_TOKEN environment variable
# instead of hard-coding it in the notebook. When the variable is unset or empty,
# token=None is passed so that login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [37]:
# Upload the train/test splits to the Hugging Face Hub under the hard-coded dataset
# repository id below (requires the login performed in the previous cell).
filtered_dataset.push_to_hub("1312354o/llama-ros2")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/1312354o/llama-ros2/commit/7c22e6a35481c01f5debaca7bf3ae0c387968f1f', commit_message='Upload dataset', commit_description='', oid='7c22e6a35481c01f5debaca7bf3ae0c387968f1f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/1312354o/llama-ros2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='1312354o/llama-ros2'), pr_revision=None, pr_num=None)