## Data Gen

In [29]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [30]:
from openai import OpenAI

# OpenAI-compatible client pointed at the OpenRouter endpoint.
# The key is read from the environment (populated by load_dotenv() above);
# os.getenv returns None when OPENROUTER_API_KEY is not set.
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Smoke test: one single-turn request to deepseek-r1.
# extra_body is forwarded as additional request-body fields; "include_reasoning"
# presumably asks OpenRouter to return the model's reasoning trace — confirm
# against the OpenRouter API docs.
completion = client.chat.completions.create(
  extra_body={
      "include_reasoning": True
  },
  model="deepseek/deepseek-r1",
  messages=[
    {
      "role": "user",
      "content": "How many major languages are in the world?"
    }
  ],
)
# Print the final answer first, then a "<think>" marker, then the reasoning
# text read from the message's extra (non-standard) fields. .get() yields
# None if the response carries no "reasoning" field.
print(completion.choices[0].message.content)
print("<think>")
print(completion.choices[0].message.model_extra.get("reasoning"))

The number of major languages in the world is not fixed and depends on the criteria used (e.g., speaker count, regional influence, official status). However, common estimates suggest:

1. **Top 20–30 Languages**: Often identified by **native/total speakers** (e.g., Ethnologue lists **23 languages** with over 50 million native speakers).  
   Examples: Mandarin Chinese, Spanish, English, Hindi, Arabic, Bengali, Portuguese, Russian, Japanese, French, German, Swahili, etc.

2. **UN Official Languages**: 6 languages (Arabic, Chinese, English, French, Russian, Spanish) are designated for global diplomacy.

3. **Regional Significance**: Languages like Swahili (Africa) or Indonesian (Southeast Asia) are major in specific regions, even with fewer global speakers.

4. **Total Speakers**: Including second-language users, languages like English (~1.5 billion total speakers) and French (~300 million) rank highly.

**Key Factors**:  
- **Native speakers**: Mandarin Chinese leads with ~1 billion.  


In [18]:
import os
import re
import json
#import openai
import pandas as pd
import tiktoken

# ---------------------------
# Configuration & API Setup
# ---------------------------

# Set your API key (if using OpenAI’s client for deepseek/deepseek-r1)
#openai.api_key = "YOUR_API_KEY"

# Tokenizer used for all token counting, chunking, and head truncation below.
# NOTE(review): this is the tokenizer tiktoken associates with gpt-3.5-turbo,
# while the model called is deepseek/deepseek-r1 — counts are presumably only
# an approximation of what that model sees; confirm this is acceptable.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")  # or any model you prefer

# Maximum tokens per chunk, and the number of tokens shared between
# consecutive chunks when a section is split with the sliding window.
MAX_TOKENS = 1500
OVERLAP_TOKENS = 150

# ---------------------------
# Helper Functions
# ---------------------------

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level tiktoken `encoding` yields for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def split_text_sliding_window(text: str, max_tokens: int = MAX_TOKENS, overlap: int = OVERLAP_TOKENS) -> list:
    """
    Split text into overlapping chunks of at most `max_tokens` tokens.

    The text is tokenized with the module-level `encoding`; each chunk is the
    decoded text of one token window. Consecutive windows share `overlap`
    tokens, i.e. each window starts `max_tokens - overlap` tokens after the
    previous one.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens repeated between consecutive chunks.

    Returns:
        A list of chunk strings: empty for empty text, a single chunk when the
        text fits within `max_tokens`.

    Raises:
        ValueError: If the text needs more than one window and the window
            cannot advance (`max_tokens <= 0` or `overlap >= max_tokens`), or
            `overlap` is negative (which would silently skip tokens).
    """
    tokens = encoding.encode(text)
    total = len(tokens)
    if total == 0:
        return []
    if 0 < total <= max_tokens:
        # Fits in one window; overlap is irrelevant here, so it is not validated.
        return [encoding.decode(tokens)]

    # More than one window is needed: the step must be strictly positive,
    # otherwise the original loop never advanced and ran forever.
    step = max_tokens - overlap
    if max_tokens <= 0 or overlap < 0 or step <= 0:
        raise ValueError(
            f"Invalid window: max_tokens={max_tokens}, overlap={overlap}; "
            "require max_tokens > 0 and 0 <= overlap < max_tokens"
        )

    chunks = []
    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(encoding.decode(tokens[start:end]))
        if end == total:
            # Last window reached the end; stop so no redundant tail chunk
            # (fully contained in this one) is emitted.
            break
    return chunks

def split_markdown_file(filepath: str, max_tokens: int = MAX_TOKENS, overlap: int = OVERLAP_TOKENS) -> dict:
    """
    Read a markdown file and cut it into token-bounded chunks.

    The file is split at every markdown heading (levels 1-6). Any text that
    precedes the first heading is the "article head"; when non-empty it also
    becomes the first section, labelled "Article Head". Each heading section's
    text is the heading line, a blank line, then the stripped body. Sections
    longer than `max_tokens` are further split with the sliding window, and
    every resulting piece keeps its section's heading.

    Returns a dict with:
      - 'head': the stripped text before the first heading (the whole stripped
        file when there are no headings)
      - 'chunks': list of {'heading': ..., 'chunk_text': ...} dicts
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        content = handle.read()

    # All heading lines, in document order.
    heading_matches = list(re.finditer(r"^(#{1,6}\s+.*)$", content, re.MULTILINE))

    # Everything up to the first heading (or the whole file) is the head.
    head_end = heading_matches[0].start() if heading_matches else len(content)
    article_head = content[:head_end].strip()

    sections = [{"heading": "Article Head", "text": article_head}] if article_head else []

    # A section body runs from the end of its heading line to the start of
    # the next heading, or to end-of-file for the last one.
    body_ends = [m.start() for m in heading_matches[1:]] + [len(content)]
    for heading_match, body_end in zip(heading_matches, body_ends):
        heading = heading_match.group().strip()
        body = content[heading_match.end():body_end].strip()
        sections.append({"heading": heading, "text": f"{heading}\n\n{body}"})

    chunks = []
    for section in sections:
        text = section["text"]
        if count_tokens(text) > max_tokens:
            pieces = split_text_sliding_window(text, max_tokens, overlap)
        else:
            pieces = [text]
        chunks.extend(
            {"heading": section["heading"], "chunk_text": piece} for piece in pieces
        )

    return {"head": article_head, "chunks": chunks}

# ---------------------------
# (Optional) Summarization Function
# ---------------------------
def summarize_article_head(article_head: str) -> str:
    """
    Produce the overall article summary used in every prompt.

    This is a truncation heuristic, not a real summarizer: the head is
    tokenized with the module-level `encoding` and only the first 100 tokens
    (or fewer, if the head is shorter) are decoded back to text.
    """
    leading_tokens = encoding.encode(article_head)[:100]
    return encoding.decode(leading_tokens)

# ---------------------------
# (2) Prepare Prompt and Call DeepSeek R1
# ---------------------------
def prepare_prompt(article_summary: str, chunk_text: str) -> str:
    """
    Build the extraction prompt sent to DeepSeek R1 for one chunk.

    The prompt has three parts separated by blank lines:
      1. "Article Summary:" followed by `article_summary`
      2. "Chunk Text:" followed by `chunk_text`
      3. Fixed instructions asking for reasoning in <think> tags and a
         nodes/edges knowledge graph as JSON in <answer> tags
    """
    instructions = (
        "Instructions: Analyze the above text, explain your reasoning inside <think> tags, "
        "and output a structured knowledge graph in JSON format inside <answer> tags. "
        "The JSON should have two keys: 'nodes' and 'edges'. For example:\n"
        '{"nodes": [{"id": 1, "label": "EntityName", "type": "EntityType"}, ...],\n'
        '"edges": [{"source": 1, "target": 2, "relation": "RELATION_TYPE"}, ...]}\n'
        "Ensure your output strictly follows this format."
    )
    parts = [
        "Article Summary:\n" + article_summary,
        "Chunk Text:\n" + chunk_text,
        instructions,
    ]
    return "\n\n".join(parts)

def call_deepseek(prompt: str):
    """
    Send `prompt` as a single user message to deepseek/deepseek-r1 through the
    module-level OpenAI-compatible `client`.

    Returns:
        A tuple (output, thinking_process, error):
          - output: the message content of the first choice
          - thinking_process: the "reasoning" field from the message's extra
            fields (None if the response has no such field)
          - error: "" on success; on failure, the exception text, with
            output and thinking_process both set to ""

    Never raises for API errors: any exception is printed and reported through
    the `error` element so a long batch run is not aborted by one bad call.
    """
    error = ""
    try:
        completion = client.chat.completions.create(
            extra_body={
                "include_reasoning": True
            },
            model="deepseek/deepseek-r1",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        message = completion.choices[0].message
        output = message.content
        thinking_process = message.model_extra.get("reasoning")
    except Exception as exc:
        print("API call failed:", exc)
        output = ""
        thinking_process = ""
        # Capture the text inside the handler: Python unbinds the `as` target
        # when the except clause exits, so the exception object itself cannot
        # be returned by name afterwards (the original raised UnboundLocalError).
        error = str(exc)
    return output, thinking_process, error

# ---------------------------
# (3) Persist Data in Tables
# ---------------------------
def persist_results(results: list, output_csv: str = "rl_training_data.csv"):
    """
    Write the collected result records to a CSV file.

    `results` is a list of dicts (one per processed chunk); they are turned
    into a DataFrame and saved to `output_csv` without the index column.
    """
    pd.DataFrame(results).to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

# ---------------------------
# Main Script
# ---------------------------
def main():
    """
    Run the full generation pipeline over every `.md` file in `data_raw`.

    Each file is split into chunks, a prompt is built per chunk from the
    truncated article head and the chunk text, DeepSeek R1 is called, and one
    record per chunk (inputs, outputs, reasoning, error) is collected. All
    records are written to CSV once every file has been processed.
    """
    docs_dir = "data_raw"
    result_list = []

    markdown_files = (name for name in os.listdir(docs_dir) if name.endswith(".md"))
    for filename in markdown_files:
        filepath = os.path.join(docs_dir, filename)
        print(f"Processing {filepath}...")
        file_data = split_markdown_file(filepath, MAX_TOKENS, OVERLAP_TOKENS)

        # The (truncated) article head serves as the shared summary for all
        # chunks of this file; an empty head yields an empty summary.
        article_head = file_data["head"]
        article_summary = summarize_article_head(article_head) if article_head else ""

        for idx, chunk in enumerate(file_data["chunks"]):
            chunk_text = chunk["chunk_text"]
            prompt = prepare_prompt(article_summary, chunk_text)
            api_output, thinking_process, error = call_deepseek(prompt)
            record = {
                "file": filename,
                "chunk_index": idx,
                "heading": chunk["heading"],
                "chunk_text": chunk_text,
                "article_summary": article_summary,
                "prompt": prompt,
                "api_output": api_output,
                "thinking_process": thinking_process,
                "error": error,
            }
            result_list.append(record)

    # Single write at the end, after all files have been processed.
    persist_results(result_list)

if __name__ == "__main__":
    # main()
    pass


这段代码是一个知识图谱提取工具，主要功能是从Markdown文档中提取结构化的知识图谱。整个流程分为以下几个部分：

1. 配置与初始化
使用tiktoken库计算文本的token数量
设置分块参数：最大token数(1500)和重叠token数(150)

2. 文本处理功能
文本分割：将长文本按照Markdown标题分块，并确保每块不超过token限制
滑动窗口：对超长段落使用重叠滑动窗口技术分割，保证语义连贯性
文章摘要：从文章头部提取摘要信息

3. AI模型交互
构建提示模板，包含文章摘要、当前文本块和指令
调用DeepSeek R1模型API，提取知识图谱
收集模型输出和推理过程

4. 数据处理与存储
将所有处理结果以结构化形式收集
保存为CSV格式，包含文件名、块索引、标题、原文、摘要、提示、API输出、推理过程和错误信息

#### 主要流程
遍历data_raw目录中的所有Markdown文件
将每个文件分割成合适大小的文本块
对每个文本块构建提示并调用AI模型
收集结果并保存为训练数据集
该代码特别适合用于构建知识图谱训练数据集，或分析AI模型的推理能力。

┌─────────────────────────┐
│ Configuration & Setup   │
│ - tiktoken for tokens   │
│ - max_tokens: 1500      │
│ - overlap: 150          │
└───────────┬─────────────┘
            ▼
┌─────────────────────────┐
│ Read Markdown Files     │
│ from data_raw directory │
└───────────┬─────────────┘
            ▼
┌─────────────────────────┐
│ For each file:          │
└───────────┬─────────────┘
            ▼
┌─────────────────────────┐
│ Text Processing         │
│ ┌───────────────────┐   │
│ │ Extract Summary   │   │
│ └────────┬──────────┘   │
│          ▼              │
│ ┌───────────────────┐   │
│ │ Split by Headers  │   │
│ └────────┬──────────┘   │
│          ▼              │
│ ┌───────────────────┐   │
│ │ Sliding Window    │   │
│ │ for Long Sections │   │
│ └────────┬──────────┘   │
└──────────┬──────────────┘
           ▼
┌─────────────────────────┐
│ For each text chunk:    │
└───────────┬─────────────┘
            ▼
┌─────────────────────────┐
│ AI Model Interaction    │
│ ┌───────────────────┐   │
│ │ Build Prompt with │   │
│ │ Summary, Chunk &  │   │
│ │ Instructions      │   │
│ └────────┬──────────┘   │
│          ▼              │
│ ┌───────────────────┐   │
│ │ Call DeepSeek R1  │   │
│ │ API               │   │
│ └────────┬──────────┘   │
│          ▼              │
│ ┌───────────────────┐   │
│ │ Collect Output &  │   │
│ │ Reasoning Process │   │
│ └────────┬──────────┘   │
└──────────┬──────────────┘
           ▼
┌─────────────────────────┐
│ Data Processing         │
│ & Storage               │
│ ┌───────────────────┐   │
│ │ Structure Results │   │
│ └────────┬──────────┘   │
│          ▼              │
│ ┌───────────────────┐   │
│ │ Save as CSV with  │   │
│ │ all metadata and  │   │
│ │ extracted content │   │
│ └───────────────────┘   │
└─────────────────────────┘

In [19]:
main()

Processing data_raw/2024_in_science.md...
Processing data_raw/year-in-review-the-biggest-climate-headlines-of-2024.md...
Processing data_raw/2024-wrapped-for-african-startups-a-year-of-growth-and-challenges.md...
Processing data_raw/2024_in_spaceflight.md...
Results saved to rl_training_data.csv


In [None]:
# Parallel version (kept commented out): uncomment it to issue the API calls concurrently, so you don't have to wait as long, provided your R1 endpoint can handle parallel requests.

# import os
# import re
# import json
# import pandas as pd
# import tiktoken
# import concurrent.futures

# # ---------------------------
# # Configuration & API Setup
# # ---------------------------

# # Set your API key (if using OpenAI’s client for deepseek/deepseek-r1)
# # For example, if using an openai-like client, you might set:
# # openai.api_key = "YOUR_API_KEY"

# # For this example, we assume a 'client' object is available.
# # If you're using the OpenAI library, you might import it and set up accordingly.
# # import openai

# # Choose the encoding – here we use a model’s encoding (adjust if needed)
# encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")  # or any model you prefer

# # Maximum token limit per chunk and overlap tokens for sliding window
# MAX_TOKENS = 1500
# OVERLAP_TOKENS = 150

# # Batch size for concurrent API calls
# BATCH_SIZE = 32

# # ---------------------------
# # Helper Functions
# # ---------------------------
# def count_tokens(text: str) -> int:
#     """Count the tokens in text using tiktoken."""
#     return len(encoding.encode(text))

# def split_text_sliding_window(text: str, max_tokens: int = MAX_TOKENS, overlap: int = OVERLAP_TOKENS) -> list:
#     """
#     Splits text into multiple overlapping chunks if it exceeds max_tokens.
#     Returns a list of text chunks.
#     """
#     tokens = encoding.encode(text)
#     total = len(tokens)
#     chunks = []
#     start = 0
#     while start < total:
#         end = min(start + max_tokens, total)
#         chunk_tokens = tokens[start:end]
#         chunk_text = encoding.decode(chunk_tokens)
#         chunks.append(chunk_text)
#         if end == total:
#             break
#         start = end - overlap  # slide the window with overlap
#     return chunks

# def split_markdown_file(filepath: str, max_tokens: int = MAX_TOKENS, overlap: int = OVERLAP_TOKENS) -> dict:
#     """
#     Reads a markdown file and splits it into chunks.
#     It uses all markdown headings (levels 1-6) as split points.
#     Also, it extracts the article head (text before the first heading).
#     If a section exceeds max_tokens, it is further split via a sliding window.
#     Returns a dict with keys:
#       - 'head': overall article text (used as summary)
#       - 'chunks': a list of dicts with keys 'heading' and 'chunk_text'
#     """
#     with open(filepath, "r", encoding="utf-8") as f:
#         content = f.read()

#     # Pattern for markdown headings (levels 1 to 6)
#     heading_regex = re.compile(r"^(#{1,6}\s+.*)$", re.MULTILINE)
#     matches = list(heading_regex.finditer(content))
    
#     # The article head is text before the first heading (if any)
#     if matches:
#         first_header_start = matches[0].start()
#         article_head = content[:first_header_start].strip()
#     else:
#         article_head = content.strip()
    
#     # Now, split the document into sections based on the heading positions.
#     sections = []
#     # If there is initial text (head) without a heading, treat it as one section.
#     if article_head:
#         sections.append({"heading": "Article Head", "text": article_head})
    
#     for i, match in enumerate(matches):
#         heading = match.group().strip()
#         start = match.end()
#         end = matches[i+1].start() if i+1 < len(matches) else len(content)
#         section_text = content[start:end].strip()
#         # Prepend the header line to the section text if desired
#         full_text = f"{heading}\n\n{section_text}"
#         sections.append({"heading": heading, "text": full_text})
    
#     # Now, for each section, if its token count exceeds max_tokens, further split it via sliding window.
#     chunks = []
#     for section in sections:
#         section_text = section["text"]
#         token_count = count_tokens(section_text)
#         if token_count > max_tokens:
#             subchunks = split_text_sliding_window(section_text, max_tokens, overlap)
#             for sub in subchunks:
#                 chunks.append({"heading": section["heading"], "chunk_text": sub})
#         else:
#             chunks.append({"heading": section["heading"], "chunk_text": section_text})
    
#     return {"head": article_head, "chunks": chunks}

# def summarize_article_head(article_head: str) -> str:
#     """
#     Generate an overall summary for the article.
#     In this example, we simply use the first 100 tokens.
#     In practice, you might call an LLM summarizer.
#     """
#     tokens = encoding.encode(article_head)
#     summary_tokens = tokens[:min(100, len(tokens))]
#     summary = encoding.decode(summary_tokens)
#     return summary

# # ---------------------------
# # (2) Prepare Prompt and Call DeepSeek R1
# # ---------------------------
# def prepare_prompt(article_summary: str, chunk_text: str) -> str:
#     """
#     Prepares the prompt for DeepSeek R1.
#     The prompt includes:
#       - Overall Article Summary (from the article head)
#       - The current chunk's text
#       - Extraction instructions
#     """
#     prompt = (
#         "Article Summary:\n" + article_summary + "\n\n" +
#         "Chunk Text:\n" + chunk_text + "\n\n" +
#         "Instructions: Analyze the above text, explain your reasoning inside <think> tags, "
#         "and output a structured knowledge graph in JSON format inside <answer> tags. "
#         "The JSON should have two keys: 'nodes' and 'edges'. For example:\n"
#         '{"nodes": [{"id": 1, "label": "EntityName", "type": "EntityType"}, ...],\n'
#         '"edges": [{"source": 1, "target": 2, "relation": "RELATION_TYPE"}, ...]}\n'
#         "Ensure your output strictly follows this format."
#     )
#     return prompt

# def call_deepseek(prompt: str):
#     """
#     Calls the deepseek/deepseek-r1 model using the OpenAI chat completions interface.
#     Returns a tuple of (api_output, thinking_process, error).
#     """
#     try:
#         # Replace 'client.chat.completions.create' with your actual API call.
#         # For example, if using OpenAI's client:
#         completion = client.chat.completions.create(
#             extra_body={"include_reasoning": True},
#             model="deepseek/deepseek-r1",
#             messages=[{"role": "user", "content": prompt}]
#         )
#         output = completion.choices[0].message.content
#         # Assume reasoning is returned in model_extra (if available)
#         thinking_process = completion.choices[0].message.model_extra.get("reasoning", "")
#         error = ""
#     except Exception as e:
#         print("API call failed:", e)
#         output = ""
#         thinking_process = ""
#         error = str(e)
#     return output, thinking_process, error

# def process_chunk(metadata: dict) -> dict:
#     """
#     Processes a single chunk using the prepared prompt and returns the metadata updated
#     with API output.
#     """
#     prompt = metadata["prompt"]
#     api_output, thinking_process, error = call_deepseek(prompt)
#     metadata["api_output"] = api_output
#     metadata["thinking_process"] = thinking_process
#     metadata["error"] = error
#     return metadata

# # ---------------------------
# # (3) Persist Data in Tables
# # ---------------------------
# def persist_results(results: list, output_csv: str = "rl_training_data.csv"):
#     """
#     Persists a list of results (each a dict) into a CSV file.
#     """
#     df = pd.DataFrame(results)
#     df.to_csv(output_csv, index=False)
#     print(f"Results saved to {output_csv}")

# # ---------------------------
# # Main Script with Batch Processing
# # ---------------------------
# def main():
#     docs_dir = "data_raw"
#     all_metadata = []

#     # Process each markdown file and accumulate chunk metadata
#     for filename in os.listdir(docs_dir):
#         if filename.endswith(".md"):
#             filepath = os.path.join(docs_dir, filename)
#             print(f"Processing {filepath}...")
#             file_data = split_markdown_file(filepath, MAX_TOKENS, OVERLAP_TOKENS)
#             article_head = file_data["head"]
#             article_summary = summarize_article_head(article_head) if article_head else ""
            
#             for idx, chunk in enumerate(file_data["chunks"]):
#                 chunk_text = chunk["chunk_text"]
#                 prompt = prepare_prompt(article_summary, chunk_text)
#                 meta = {
#                     "file": filename,
#                     "chunk_index": idx,
#                     "heading": chunk["heading"],
#                     "chunk_text": chunk_text,
#                     "article_summary": article_summary,
#                     "prompt": prompt
#                 }
#                 all_metadata.append(meta)
    
#     # Process chunks in parallel using a ThreadPoolExecutor with batch size BATCH_SIZE.
#     results = []
#     with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
#         # Map all_metadata to process_chunk concurrently
#         futures = [executor.submit(process_chunk, meta) for meta in all_metadata]
#         for future in concurrent.futures.as_completed(futures):
#             result = future.result()
#             results.append(result)
    
#     # Persist the results to a CSV file
#     persist_results(results)

# if __name__ == "__main__":
#     main()

## Data Wrangling

### Human Editing

- Removed the first part of each article, since it is the article's summary section.
- Removed dirty entries produced during execution.

### Polishing of data

- We now flag entries whose reasoning process is too long.
- We now validate the extracted output and count its nodes and edges.

In [24]:
import pandas as pd
import json
import re
import tiktoken

# ---------------------------
# Configuration for tiktoken
# ---------------------------
# Tokenizer used by count_tokens() below to measure reasoning length.
# This rebinds the same `encoding` name (with the same model) that the
# data-generation cell defines, so this cell can run on its own.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def count_tokens(text: str) -> int:
    """
    Best-effort token count of `text` using the module-level tiktoken `encoding`.

    Any failure while encoding (e.g. `text` is not a string, such as a missing
    CSV value) is swallowed and reported as 0 tokens.
    """
    try:
        token_ids = encoding.encode(text)
    except Exception:
        return 0
    return len(token_ids)

def extract_json_from_answer(api_output: str) -> str:
    """
    Return the stripped text between the first <answer> and the nearest
    following </answer> in `api_output` (the match may span multiple lines).

    Returns None when no complete <answer>...</answer> pair is present.
    """
    found = re.search(r"<answer>(.*?)</answer>", api_output, re.DOTALL)
    return found.group(1).strip() if found else None

def validate_json(json_text: str):
    """
    Try to parse `json_text` as JSON.

    Returns (parsed_object, True) when parsing succeeds, and (None, False)
    when it fails for any reason (malformed JSON, non-string input, ...).
    """
    try:
        parsed = json.loads(json_text)
    except Exception:
        return None, False
    return parsed, True

def polish_row(row: pd.Series, max_reasoning_tokens: int = 7800) -> pd.Series:
    """
    Annotate one dataset row with quality flags and graph statistics.

    Columns written to `row`:
      - thinking_process_token_count: token count of 'thinking_process'
      - bad_reasoning: True when that count is >= `max_reasoning_tokens`
      - bad_extraction: True unless 'api_output' contains an <answer> block
        holding a valid JSON object whose 'nodes' and 'edges' (when present)
        are lists
      - node_count / edge_count: lengths of those lists, or None when the
        extraction is bad

    Args:
        row: One record of the generated dataset; missing 'thinking_process'
            or 'api_output' values are treated as empty.
        max_reasoning_tokens: Threshold at or above which the reasoning is
            flagged as too long (default 7800, the value used originally).

    Returns:
        The same row, with the columns above added.
    """
    # Reasoning length check.
    tp = row.get("thinking_process", "")
    tp_token_count = count_tokens(tp)
    row["thinking_process_token_count"] = tp_token_count
    row["bad_reasoning"] = tp_token_count >= max_reasoning_tokens

    # Assume the extraction is bad until a well-formed graph is found.
    row["bad_extraction"] = True
    row["node_count"] = None
    row["edge_count"] = None

    # str() guards against non-string cells (e.g. NaN for an empty CSV field).
    api_output = row.get("api_output", "")
    json_text = extract_json_from_answer(str(api_output))
    if json_text:
        parsed_json, is_valid = validate_json(json_text)
        # Valid JSON that is not an object (e.g. a bare list) has no
        # nodes/edges and must not pass as a good extraction.
        if is_valid and isinstance(parsed_json, dict):
            nodes = parsed_json.get("nodes", [])
            edges = parsed_json.get("edges", [])
            # Non-list values (e.g. null) cannot be counted; treat as bad
            # instead of letting len() raise and abort the whole apply().
            if isinstance(nodes, list) and isinstance(edges, list):
                row["bad_extraction"] = False
                row["node_count"] = len(nodes)
                row["edge_count"] = len(edges)

    # (Optional) Additional polishing steps can be added here
    # For example: count tokens in prompt or chunk_text, flag if missing fields, etc.

    return row

def polish_dataset(input_csv: str, output_csv: str):
    """
    Polish a whole dataset file.

    Reads `input_csv`, runs `polish_row` on every row, and writes the
    annotated rows to `output_csv` (without the index column).
    """
    polished = pd.read_csv(input_csv).apply(polish_row, axis=1)
    polished.to_csv(output_csv, index=False)
    print(f"Polished dataset saved to {output_csv}")

if __name__ == "__main__":
    polish_dataset("rl_training_data_human_reviewed.csv", "polished_rl_training_data.csv")


Polished dataset saved to polished_rl_training_data.csv


After this step, we can see that one entry had its reasoning truncated (it exceeded 8,000 tokens), and a few entries (5) had their output truncated during graph extraction.

In [27]:
# Reload the polished dataset written by polish_dataset() above.
df = pd.read_csv("polished_rl_training_data.csv")
# Keep only rows where neither quality flag is set: reasoning below the
# length threshold and a successfully parsed <answer> JSON.
filtered_df = df[(df['bad_reasoning'] == False) & (df['bad_extraction'] == False)]

In [28]:
filtered_df

Unnamed: 0,file,chunk_index,heading,chunk_text,article_summary,prompt,api_output,thinking_process,error,thinking_process_token_count,bad_reasoning,bad_extraction,node_count,edge_count
0,2024_in_science.md,1,### January,### January\n\n- 2 January – The [Japan Meteor...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n\n## 2024 in science\n\n> UR...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,1053,False,False,25.0,17.0
1,2024_in_science.md,2,### January,](https://en.wikipedia.org/wiki/Extinction) of...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,988,False,False,34.0,23.0
2,2024_in_science.md,3,### January,"en.wikipedia.org/wiki/101955_Bennu), after thr...",## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by reading through the provi...",,2919,False,False,32.0,21.0
3,2024_in_science.md,4,### January,-first_century.webp)[24 January](https://en.wi...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by reading through the provi...",,4959,False,False,36.0,23.0
4,2024_in_science.md,5,### January,2024_in_science#cite_note-81)[[82]](https://en...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,1368,False,False,17.0,12.0
5,2024_in_science.md,6,### February,### February\n\n- 2 February\n - Scientists r...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,2717,False,False,28.0,18.0
6,2024_in_science.md,7,### February,.wikipedia.org/wiki/Mobile_phone).[[114]](http...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","\nOkay, let's tackle this problem step by step...",,1085,False,False,17.0,14.0
7,2024_in_science.md,8,### February,"or pollution-mediated, deaths from automobili...",## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,3960,False,False,30.0,19.0
8,2024_in_science.md,9,### February,[prebiotic](https://en.wikipedia.org/wiki/Pre...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n## 2024 in science\n\n> URL ...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,1963,False,False,40.0,23.0
9,2024_in_science.md,10,### February,.wikipedia.org/wiki/2024_in_science#cite_note-...,## 2024 in science\n\n> URL Source: https://en...,Article Summary:\n\n## 2024 in science\n\n> UR...,"<answer>\n{\n ""nodes"": [\n {""id"": 1, ""labe...","Okay, let's start by analyzing the given text....",,5365,False,False,33.0,21.0


In [31]:
# One dict per retained row (column name -> value), consumed by the
# training-example loop in the next cell.
data = filtered_df.to_dict(orient='records')

In [33]:
# Accumulates the training examples: one {"prompt": [...], "answer": ...}
# dict per retained row.
dataset = []


# System message attached to every example; it prescribes the response layout.
# NOTE(review): this asks for <reasoning> tags, while the user prompt built by
# prepare_prompt() instructs the model to reason inside <think> tags — confirm
# the mismatch is intended.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template for the ground-truth completion; the backslash after the opening
# quotes suppresses the leading newline.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

for row in data:
    # Use the prompt column from your CSV as the user prompt.
    user_prompt = row["prompt"]
    # Use the thinking_process and api_output to create the ground-truth answer.
    reasoning = row["thinking_process"]
    # Only the JSON inside <answer>...</answer> is kept. Rows reaching this
    # loop were filtered on bad_extraction == False, so an <answer> block is
    # expected; extract_json_from_answer would return None otherwise.
    final_answer = extract_json_from_answer(row["api_output"])
    
    # Format the ground-truth answer using the XML_COT_FORMAT.
    ground_truth = XML_COT_FORMAT.format(reasoning=reasoning, answer=final_answer)
    
    # Create the training example with a list of messages.
    dataset.append({
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ],
        "answer": ground_truth
    })