In [4]:
from graphviz import Digraph

# Diagram of the DPO data pipeline, drawn left-to-right: three raw joke
# sources are each cleaned and turned into DPO pairs, then merged into
# one final dataset.
dot = Digraph("dpo_pipeline", format="png")
dot.attr(rankdir="LR", fontsize="10")

# Every node as (identifier, label, shape), listed in declaration order so
# the generated DOT source keeps the same node sequence.
pipeline_nodes = [
    # Raw datasets
    ("dad_raw", "Dad jokes\n(dad_jokes_splitted_final.csv)", "folder"),
    ("redit_Jokes_raw", "redit_Jokes set\n(jokes_redit_Jokes_set.csv)", "folder"),
    ("million_raw", "One Million Reddit Jokes\n(SocialGrep/one-million-reddit-jokes)", "folder"),
    # Cleaning / dedup stages
    ("dad_clean", "Clean + dedup\n(question/response)", "box"),
    ("redit_Jokes_clean", "Clean + strip tails\n(body/punchline)\n+ dedup", "box"),
    ("million_clean", "Clean + strip tails\n(title/selftext)\n+ dedup", "box"),
    # DPO pairs per source
    ("dad_pairs", "Dad DPO pairs\n(dad_jokes_dpo_pairs.csv)", "box"),
    ("redit_Jokes_pairs", "redit_Jokes-set DPO pairs\n(redit_Jokes_set_dpo.csv)", "box"),
    ("million_pairs", "Million DPO pairs\n(pairs_dpo_one_million.csv)", "box"),
    # Merge and final
    ("merge_priority", "Merge with priority:\nDad > redit_Jokes > Million\n+ length filters", "box"),
    ("final", "Final DPO dataset\n(dpo_final_set.csv)", "doubleoctagon"),
]
for node_id, node_label, node_shape in pipeline_nodes:
    dot.node(node_id, node_label, shape=node_shape)

# Node-id prefixes of the three sources, paired with the label shown on the
# edge that feeds each source's pairs into the merge step.
source_prefixes = ("dad", "redit_Jokes", "million")
merge_roles = ("primary", "secondary", "fallback")

# Unlabelled per-source edges: raw -> clean, then clean -> pairs.
dot.edges([(f"{prefix}_raw", f"{prefix}_clean") for prefix in source_prefixes])
dot.edges([(f"{prefix}_clean", f"{prefix}_pairs") for prefix in source_prefixes])

# Labelled edges into the merge node, one per source.
for source_prefix, merge_role in zip(source_prefixes, merge_roles):
    dot.edge(f"{source_prefix}_pairs", "merge_priority", label=merge_role)

dot.edge("merge_priority", "final")

# Render to docs/; cleanup=True asks graphviz to delete the intermediate
# DOT source file once the image has been produced.
dot.render("docs/dpo_pipeline", cleanup=True)
print("Saved docs/dpo_pipeline.png")


Saved docs/dpo_pipeline.png
